Code Example #1
File: tests_plugin_pdf.py  Project: turicas/rows
    def test_real_data_3(self):
        filename = "tests/data/eleicoes-tcesp-161-162.pdf"
        expected1 = "tests/data/expected-eleicoes-tcesp-161-{}.csv".format(self.backend)
        expected2 = "tests/data/expected-eleicoes-tcesp-162-{}.csv".format(self.backend)
        begin = re.compile("Documento gerado em.*")
        end = re.compile("Página: [0-9]+ de.*")

        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(1,),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected1)
        self.assertEqual(list(expected), list(result))

        result = rows.import_from_pdf(
            filename,
            backend=self.backend,
            page_numbers=(2,),
            starts_after=begin,
            ends_before=end,
            algorithm="header-position",
        )
        expected = rows.import_from_csv(expected2)
        self.assertEqual(list(expected), list(result))
Code Example #2
    def parse_pdf(self, response):
        pdf = rows.plugins.pdf.PyMuPDFBackend(io.BytesIO(response.body))
        pages = pdf.text_objects(starts_after=re.compile("EM INVESTIGAÇÃO.*"))
        date = None  # Avoid a NameError if no "Fonte:" object is found
        for page in pages:
            for obj in page:
                if obj.text.startswith("Fonte:"):
                    day, month, year = re.compile(
                        "([0-9]{2})/([0-9]{2})/([0-9]{4})"
                    ).findall(obj.text)[0]
                    date = datetime.date(int(year), int(month), int(day))
                    break
            if date is not None:  # A bare break only exits the inner loop
                break
        self.add_report(date=date, url=response.url)

        table = rows.import_from_pdf(
            io.BytesIO(response.body),
            starts_after=re.compile("DADOS DETALHADOS POR MUNICÍPIO DE RESIDÊNCIA.*"),
            ends_before=re.compile("Fonte:"),
        )
        confirmed_cases = {}
        for row in table:
            city = convert_city(row.municipio_de_residencia)
            if city is None:
                continue
            confirmed = row.casos_confirmados_incidencia_por_n_100_ooo_hab.splitlines()[
                0
            ]
            if confirmed in ("-", ""):
                confirmed = None
            else:
                confirmed = int(confirmed)
            confirmed_cases[city] = confirmed

        table = rows.import_from_pdf(
            io.BytesIO(response.body),
            starts_after=re.compile("EM INVESTIGAÇÃO.*"),
            ends_before=re.compile("Fonte:"),
        )
        deaths_cases = {}
        for row in table:
            city = convert_city(row.field_0)
            if city is None:
                continue
            deaths_cases[city] = int(row.confirmado)

        cities = set(confirmed_cases.keys()) | set(deaths_cases.keys())
        for city in cities:
            confirmed = confirmed_cases.get(city, None)
            deaths = deaths_cases.get(city, None)
            if confirmed is None and deaths is None:
                continue
            confirmed = confirmed or 0
            deaths = deaths or 0
            if confirmed == 0 and deaths == 0:
                continue
            if city == "TOTAL NO ESTADO":
                self.add_state_case(confirmed=confirmed, deaths=deaths)
            else:
                self.add_city_case(city=city, confirmed=confirmed, deaths=deaths)
Code Example #3
    def extract(self):
        filename, metadata = self.filename, self.metadata
        extension = filename.name.split(".")[-1].lower()

        # From 2017-11 to 2018-12, get data from Brasil.IO (in CSV) using the
        # `extract_magistrados` helper function.
        if "contracheque.csv" in filename.name:
            yield from extract_magistrados(filename, self.state)

        elif extension == "pdf":
            if metadata["ano"] == 2018 or (metadata["ano"] == 2017
                                           and metadata["mes"] in (11, 12)):
                # Data already converted in contracheque.csv
                return

            total_pages = rows.plugins.pdf.number_of_pages(self.filename)
            for page in range(1, total_pages + 1):
                table = rows.import_from_pdf(
                    self.filename,
                    page_numbers=(page,),
                    fields=self.fields,
                    skip_header=page == 1,
                )
                for row in table:
                    yield {
                        "cargo": row.cargo,
                        "nome": row.nome.replace("\n", " ").strip(),
                        "rendimento_bruto": row.total_de_rendimentos,
                        "rendimento_liquido": row.rendimento_liquido,
                    }
Code Example #4
def extract_table(filename_or_fobj):
    total_pages = rows.plugins.pdf.number_of_pages(filename_or_fobj, backend="pymupdf")
    result = []
    for page_number in range(1, total_pages + 1):
        page_text = next(
            rows.plugins.pdf.pdf_to_text(
                filename_or_fobj, page_numbers=(page_number,), backend="pymupdf"
            )
        )
        page_meta = extract_page_metadata(page_text)
        if page_meta is None:  # Empty PDF
            return None

        table = rows.import_from_pdf(
            filename_or_fobj,
            page_numbers=(page_number,),
            backend="pymupdf",
            algorithm=YGroupsXPositionAlgorithm,
            fields=FIELDS,
            skip_header=False,
            starts_after=starts_after,
            ends_before=ends_before,
        )
        for row in table:
            if list(row._asdict().values()).count("") > 3:  # Empty line
                continue
            row = convert_row(row)
            row.update(page_meta)
            result.append(row)

    return result
Code Example #5
def extrai_tabela(url):
    url_final = f"http://www.imea.com.br/upload/publicacoes/arquivos/{url}"
    response = requests.get(url_final)
    return rows.import_from_pdf(
        io.BytesIO(response.content),
        ends_before=re.compile(r'\* ?Variação em .*'),
    )
Code Example #6
def parse_pdf(filename, meta):
    # Extract update date
    pdf_doc = PyMuPDFBackend(filename)
    update_date = None
    for page in pdf_doc.objects():
        for obj in page:
            if REGEXP_UPDATE.match(obj.text):
                update_date = PtBrDateField.deserialize(
                    REGEXP_UPDATE.findall(obj.text)[0])
                break
    if update_date is None:  # String not found in PDF
        # Parse URL to get date inside PDF's filename
        date = (meta["boletim_url"].split("/")[-1].split(".pdf")[0].replace(
            "CORONA_", "").split("_")[0])
        update_date = PtBrDateField2.deserialize(date)

    # Extract rows and inject update date and metadata
    table = rows.import_from_pdf(filename, backend="min-x0")
    for row in table:
        if row.municipio == "TOTAL GERAL":
            continue
        row = row._asdict()
        row["data"] = update_date
        row.update(meta)
        yield convert_row(row)
Code Example #7
def extract_2015(filename):
    starts_after = re.compile(".*DE 13/11/2002")
    pages = range(1, rows.plugins.pdf.number_of_pages(filename) + 1)
    for page in tqdm(pages, desc=filename):
        table = rows.import_from_pdf(filename,
                                     page_numbers=(page,),
                                     starts_after=starts_after)
        for row in table:
            yield convert_row_2015(row)
Code Example #8
File: tests_plugin_pdf.py  Project: turicas/rows
    def test_real_data_2(self):
        filename = "tests/data/milho-safra-2017"
        result = rows.import_from_pdf(
            filename + ".pdf",
            backend=self.backend,
            starts_after=re.compile("MILHO SAFRA 16/17: ACOMPANHAMENTO DE .*"),
            ends_before="*Variação em pontos percentuais.",
        )
        expected = rows.import_from_csv(filename + ".csv")
        self.assertEqual(list(expected), list(result))
Code Example #9
def extract_table(fobj):
    table = rows.import_from_pdf(fobj, backend="pymupdf")
    result = []
    for row in table:
        row = row._asdict()
        row["local_da_coleta"] = clean(row["local_da_coleta"])
        row["ponto_codigo"] = clean(row["ponto_codigo"])
        row["costa_ponto"] = extrai_costa(row["ponto_codigo"])
        result.append(row)
    return result
Code Example #10
File: tests_plugin_pdf.py  Project: turicas/rows
    def test_rects_boundaries(self):
        filename = "tests/data/ibama-autuacao-amazonas-2010-pag2"
        result = rows.import_from_pdf(
            filename + ".pdf",
            backend=self.backend,
            starts_after=re.compile("DIRETORIA DE PROTE.*"),
            ends_before=re.compile("Pag [0-9]+/[0-9]+"),
            algorithm="rects-boundaries",
        )
        expected = rows.import_from_csv(filename + ".csv")
        self.assertEqual(list(expected), list(result))
Code Example #11
    def read_pdf(self, response):
        path = f"download/{Path(response.url).name}"
        self.logger.info("Saving PDF %s from %s", path, response.url)

        with open(path, "wb") as f:
            f.write(response.body)

        can_read = False
        data = {}
        city = None

        for row in rows.import_from_pdf(path):
            row = row._asdict()
            if (SUSPECTED in clean(row["field_1"])
                    and DISCARDED in clean(row["field_2"])
                    and CONFIRMED in clean(row["field_3"])):
                can_read = True
                continue

            if can_read:
                city = list(row.values())[0]
                if len(city.split("\n")) > 3:
                    cities, suspected, discarded, confirmed = row.values()
                    cities = clean_cities(cities)
                    discarded = discarded.split("\n")

                    suspected = list(
                        it.islice(suspected.split("\n"), 0, None, 2))
                    confirmed = list(
                        it.islice(confirmed.split("\n"), 0, None, 2))

                    dt = list(zip(cities, suspected, discarded, confirmed))
                    for (cit, susp, disc, conf) in dt:
                        yield {
                            "municipio": cit,
                            "suspeitos": change_format(susp),
                            "descartados": change_format(disc),
                            "confirmados": change_format(conf),
                        }
                else:
                    city = clean(city)
                    data["municipio"] = city
                    data["suspeitos"] = row["field_1"].split("\n")[0]
                    data["descartados"] = row["field_2"]
                    data["confirmados"] = row["field_3"].split("\n")[0]
                yield data

            if city and any(city in text for text in tokens):
                break
Code Example #12
def parse_file(filename):
    """Parse Amazonas' PDF file containing state employee information"""

    total_pages = rows.plugins.pdf.number_of_pages(filename)
    result = []
    for page in range(1, total_pages + 1):
        table = rows.import_from_pdf(
            filename,
            page_numbers=(page,),
            starts_after="NOME",
            fields=PDF_FIELD_TYPES,
            skip_header=True,
        )
        for row in table:
            result.append(convert_row(row))

    return rows.import_from_dicts(result)
Code Example #13
    def parse(self, response):
        meta = response.request.meta.copy()
        filename = meta['filename']
        meta['costa_menu'] = meta['costa']
        del_keys = [
            key for key in meta.keys()
            if key.startswith('download_') or key in ('url', 'filename',
                                                      'depth', 'costa')
        ]
        for key in del_keys:
            del meta[key]
        with open(filename, mode='wb') as fobj:
            fobj.write(response.body)

        for row in rows.import_from_pdf(io.BytesIO(response.body)):
            row = row._asdict()
            row['local_da_coleta'] = clean(row['local_da_coleta'])
            row['ponto_codigo'] = clean(row['ponto_codigo'])
            row['costa_ponto'] = extrai_costa(row['ponto_codigo'])
            row.update(meta)
            yield row
Code Example #14
File: tests_plugin_pdf.py  Project: turicas/rows
    def test_real_data_1(self):
        filename = "tests/data/balneabilidade-26-2010"
        result = rows.import_from_pdf(filename + ".pdf", backend=self.backend)
        expected = rows.import_from_csv(filename + ".csv")
        self.assertEqual(list(expected), list(result))
Code Example #15
import rows

file = rows.import_from_pdf('samples/xp-2.pdf')

print(file.fields)
Code Example #16
File: extract_pdf.py  Project: turicas/rows
import io

import requests

import rows

url = "http://balneabilidade.inema.ba.gov.br/index.php/relatoriodebalneabilidade/geraBoletim?idcampanha=42041"
print("*** Downloading PDF...")
response = requests.get(url)

# The line below will automatically identify the table in all PDF pages - it
# works for this file but not for all cases. You can be more specific defining
# the page numbers, a start/end string (like the header/footer strings) and
# also change the table identification algorithm. Check `backend`, `algorithm`,
# `starts_after`, `ends_before` and `page_numbers` parameters.
# For this simple case you could also install rows' CLI (`pip install
# rows[cli]`) and run: `rows print <url>`
table = rows.import_from_pdf(io.BytesIO(response.content))
rows.export_to_csv(table, "beach-data.csv")
print("*** Table exported to beach-data.csv")

print("*** Extracted table:")
print(rows.export_to_txt(table))

# You could also iterate over the object, like:
# for row in table: print(row)


print("\n\n*** Extracted text:")
text_pages = rows.plugins.pdf.pdf_to_text(io.BytesIO(response.content))
print("\n\n".join(text_pages))