Example #1
def download_organizations():
    "Download organizations JSON and extract its properties"

    response = requests.get(URL)
    data = response.json()
    organizations = [organization["properties"] for organization in data["features"]]
    return rows.import_from_dicts(organizations)
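As a reference for the examples below, here is a minimal, self-contained sketch (the data is invented, not taken from any of these projects) of what `rows.import_from_dicts` returns: a `rows.Table` whose field types are inferred from the values and whose rows behave like namedtuples.

import rows

# Invented sample data: two dicts sharing the same keys
data = [
    {"name": "Alice", "age": 30},
    {"name": "Bob", "age": 25},
]
table = rows.import_from_dicts(data)
print(table.field_names)    # ['name', 'age']
print(table.fields["age"])  # the detected type, rows.fields.IntegerField
for row in table:           # each row behaves like a namedtuple
    print(row.name, row.age)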
Example #2
    def test_import_from_dicts_return_desired_data(self):
        table = rows.import_from_dicts(self.data)

        self.assertEqual(len(table), 3)
        self.assertEqual(len(table.fields), 4)
        self.assertEqual(
            set(table.field_names), set(["ids", "name", "number", "other"])
        )
        self.assertEqual(table.fields["name"], rows.fields.TextField)
        self.assertEqual(table.fields["ids"], rows.fields.TextField)
        self.assertEqual(table.fields["number"], rows.fields.IntegerField)
        self.assertEqual(table.fields["other"], rows.fields.FloatField)

        self.assertEqual(table[0].name, "Álvaro")
        self.assertEqual(table[0].ids, "123")
        self.assertEqual(table[0].number, 3)
        self.assertEqual(table[0].other, None)
        self.assertEqual(table[1].name, "Test")
        self.assertEqual(table[1].ids, "456")
        self.assertEqual(table[1].number, None)
        self.assertEqual(table[1].other, None)
        self.assertEqual(table[2].name, "Python")
        self.assertEqual(table[2].ids, "123, 456")
        self.assertEqual(table[2].number, None)
        self.assertEqual(table[2].other, 3.14)
Example #3
def download_organizations():
    'Download organizations JSON and extract its properties'

    page = urllib.request.urlopen(URL)
    data = json.loads(page.read())
    organizations = [organization['properties']
                     for organization in data['features']]
    return rows.import_from_dicts(organizations)
Example #4
 def test_export_callback(self):
     table = rows.import_from_dicts([{"id": number} for number in range(10)])
     myfunc = mock.Mock()
     rows.export_to_sqlite(table, ":memory:", callback=myfunc, batch_size=3)
     self.assertEqual(myfunc.call_count, 4)
     self.assertEqual(
         [(x[0][0], x[0][1]) for x in myfunc.call_args_list],
         [(3, 3), (3, 6), (3, 9), (1, 10)],
     )
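The assertions above pin down the callback contract: with `batch_size=3` and 10 rows, the callback is invoked once per batch with the number of rows written in that batch and the running total. A short sketch of using that contract for progress reporting (the `report` function is ours, not part of the library):

import rows

table = rows.import_from_dicts([{"id": n} for n in range(10)])

def report(written, total):
    # called after each batch: rows written in this batch, rows written so far
    print(f"wrote {written} rows (total: {total})")

rows.export_to_sqlite(table, ":memory:", callback=report, batch_size=3)
# wrote 3 rows (total: 3)
# wrote 3 rows (total: 6)
# wrote 3 rows (total: 9)
# wrote 1 rows (total: 10)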
Example #5
    def test_import_field_limit(self):
        temp = tempfile.NamedTemporaryFile(delete=False)
        filename = "{}.{}".format(temp.name, self.file_extension)
        self.files_to_delete.append(filename)

        table = rows.import_from_dicts([{"f1": "a" * 132000}])
        rows.export_to_csv(table, filename)

        # The following line must not raise the exception:
        # `_csv.Error: field larger than field limit (131072)`
        new = rows.import_from_csv(filename)
Example #6
    def test_export_to_dicts(self):
        table = rows.import_from_dicts(self.data)
        result = rows.export_to_dicts(table)
        full_data = [
            {"name": "Álvaro", "ids": "123", "number": 3, "other": None},
            {"name": "Test", "ids": "456", "number": None, "other": None},
            {"name": "Python", "ids": "123, 456", "number": None, "other": 3.14},
        ]

        self.assertEqual(len(result), len(table))
        for expected, actual in zip(full_data, result):
            self.assertDictEqual(expected, actual)
Example #7
    def test_import_from_dicts_accepts_generator(self):
        max_size = 1000
        samples = 200
        generator = utils.LazyDictGenerator(max_size)
        datagen = iter(generator)
        table = rows.import_from_dicts(datagen, lazy=True, samples=samples)
        # `create_table` will consume the whole generator
        self.assertEqual(generator.last, max_size - 1)

        data = list(table)
        self.assertEqual(len(data), max_size)
        self.assertEqual(generator.last, max_size - 1)
Example #8
    def test_import_from_dicts_accepts_generator(self):
        max_size = 1000
        samples = 200
        generator = utils.LazyDictGenerator(max_size)
        datagen = iter(generator)
        table = rows.import_from_dicts(datagen, lazy=True, samples=samples)
        # `create_table` will consume the whole generator
        self.assertEqual(generator.last, max_size - 1)

        data = list(table)
        self.assertEqual(len(data), max_size)
        self.assertEqual(generator.last, max_size - 1)
Example #9
    def test_import_from_dicts_maintains_header_order(self):
        headers = list(string.ascii_lowercase)
        random.shuffle(headers)

        data = [
            OrderedDict([(header, 1) for header in headers]),
            OrderedDict([(header, 2) for header in headers]),
            OrderedDict([(header, 3) for header in headers]),
            OrderedDict([(header, 4) for header in headers]),
            OrderedDict([(header, 5) for header in headers]),
        ]
        table = rows.import_from_dicts(data)
        self.assertEqual(table.field_names, headers)
Example #10
    def test_import_from_dicts_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        kwargs = {'some_key': 123, 'other': 456, }

        result = rows.import_from_dicts(self.data, **kwargs)

        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs['meta'] = {'imported_from': 'dicts', }
        self.assertEqual(call[1], kwargs)
Example #11
    def test_import_from_dicts_maintains_header_order(self):
        headers = list(string.ascii_lowercase)
        random.shuffle(headers)

        data = [
            OrderedDict([(header, 1) for header in headers]),
            OrderedDict([(header, 2) for header in headers]),
            OrderedDict([(header, 3) for header in headers]),
            OrderedDict([(header, 4) for header in headers]),
            OrderedDict([(header, 5) for header in headers]),
        ]
        table = rows.import_from_dicts(data)
        self.assertEqual(table.field_names, headers)
Example #12
    def test_import_from_dicts_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        kwargs = {'some_key': 123, 'other': 456, }

        result = rows.import_from_dicts(self.data, **kwargs)

        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs['meta'] = {'imported_from': 'dicts', }
        self.assertEqual(call[1], kwargs)
Example #13
    def execute(self, year, action):
        logging.info(f"[Budget-CE] Starting for {year} and action {action}")
        self.select_year(year)
        self.select_month("Dezembro")  # December has the cumulative for the year
        self.select_action(action)
        self.select_modality_91("TUDO")
        self.select_report("Outros", "PA")
        filename = self.do_search()
        result = self.parse_budget(filename, year, action)

        for row in rows.import_from_dicts(result):
            yield row._asdict()
Example #14
def extract_boletim(state, data):
    table = rows.import_from_dicts(
        data,
        force_types={
            "date": rows.fields.DateField,
            "notes": rows.fields.TextField,
            "state": rows.fields.TextField,
            "url": rows.fields.TextField,
        },
    )
    for row in table:
        row = row._asdict()
        yield row
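A hedged sketch (with invented values) of what `force_types` does in calls like the one above: the listed columns keep the type you declare, while the remaining columns still go through automatic detection.

import rows

data = [{"date": "2020-03-27", "confirmed": "42", "notes": ""}]
table = rows.import_from_dicts(
    data,
    force_types={"notes": rows.fields.TextField},  # never let detection pick another type
)
print(table.fields["date"])       # inferred from the sample values
print(table.fields["confirmed"])  # inferred from the sample values
print(table.fields["notes"])      # forced to TextField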
Example #15
    def test_import_from_dicts_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        kwargs = {"some_key": 123, "other": 456}

        result = rows.import_from_dicts(self.data, **kwargs)

        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs["meta"] = {"imported_from": "dicts"}
        kwargs["samples"] = None
        self.assertEqual(call[1], kwargs)
Example #16
def csv_to_sqlite(
    input_filename,
    output_filename,
    samples=None,
    dialect=None,
    batch_size=10000,
    encoding="utf-8",
    callback=None,
    force_types=None,
    chunk_size=8388608,
    table_name="table1",
    schema=None,
):
    "Export a CSV file to SQLite, based on field type detection from samples"

    # TODO: automatically detect encoding if encoding == `None`
    # TODO: should be able to specify fields

    if dialect is None:  # Get a sample to detect dialect
        fobj = open_compressed(input_filename, mode="rb")
        sample = fobj.read(chunk_size)
        dialect = rows.plugins.csv.discover_dialect(sample, encoding=encoding)
    elif isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)

    if schema is None:  # Identify data types
        fobj = open_compressed(input_filename, encoding=encoding)
        data = list(islice(csv.DictReader(fobj, dialect=dialect), samples))
        schema = rows.import_from_dicts(data).fields
        if force_types is not None:
            schema.update(force_types)

    # Create lazy table object to be converted
    # TODO: this lazyness feature will be incorported into the library soon so
    #       we can call here `rows.import_from_csv` instead of `csv.reader`.
    reader = csv.reader(
        open_compressed(input_filename, encoding=encoding), dialect=dialect
    )
    header = make_header(next(reader))  # skip header
    table = rows.Table(fields=OrderedDict([(field, schema[field]) for field in header]))
    table._rows = reader

    # Export to SQLite
    return rows.export_to_sqlite(
        table,
        output_filename,
        table_name=table_name,
        batch_size=batch_size,
        callback=callback,
    )
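A hedged usage sketch for the helper defined above; the file names, table name and progress function are hypothetical.

def print_progress(written, total):
    # rows.export_to_sqlite calls this after each batch (rows in batch, total rows)
    print(f"{total} rows exported so far")

csv_to_sqlite(
    "dataset.csv.gz",          # hypothetical (possibly compressed) input file
    "dataset.sqlite",          # hypothetical output database
    table_name="dataset",
    samples=5000,              # rows read to detect the schema
    callback=print_progress,
)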
Example #17
    def test_import_from_dicts_uses_create_table(self, mocked_create_table):
        mocked_create_table.return_value = 42
        kwargs = {"some_key": 123, "other": 456}

        result = rows.import_from_dicts(self.data, **kwargs)

        self.assertTrue(mocked_create_table.called)
        self.assertEqual(mocked_create_table.call_count, 1)
        self.assertEqual(result, 42)

        call = mocked_create_table.call_args
        kwargs["meta"] = {"imported_from": "dicts"}
        kwargs["samples"] = None
        self.assertEqual(call[1], kwargs)
Example #18
def pdf_to_csv(input_filename, output_filename):
    total_pages = rows.plugins.pdf.number_of_pages(input_filename)
    pdf = rows.plugins.pdf.PyMuPDFBackend(input_filename)
    result = []
    for page_number in range(1, total_pages + 1):
        page = list(next(pdf.objects(page_numbers=(page_number, ))))
        data = list(rows.plugins.utils.ipartition(page, 4))
        header = [obj.text for obj in data[0]]
        for row in data[1:]:
            row = dict(zip(header, [obj.text for obj in row]))
            row["codigo_ibge"] = row.pop("IBGE")
            row["perfil"] = row.pop("Perfil Município")
            result.append(row)
    table = rows.import_from_dicts(result)
    rows.export_to_csv(table, output_filename)
Example #19
def parse_file(filename):
    """Parse Amazonas' PDF file containing state employee information"""

    total_pages = rows.plugins.pdf.number_of_pages(filename)
    result = []
    for page in range(1, total_pages + 1):
        table = rows.import_from_pdf(
            filename,
            page_numbers=(page, ),
            starts_after="NOME",
            fields=PDF_FIELD_TYPES,
            skip_header=True,
        )
        for row in table:
            result.append(convert_row(row))

    return rows.import_from_dicts(result)
Example #20
def convert(state, input_filename, output_filename):
    table = rows.import_from_csv(
        input_filename,
        force_types={
            "confirmed": rows.fields.IntegerField,
            "deaths": rows.fields.IntegerField,
        },
    )
    state_cities = ["TOTAL NO ESTADO", "Importados/Indefinidos"] + sorted(
        row.municipio for row in cities if row.uf == state
    )
    confirmed, deaths, dates = {}, {}, []
    for row in table:
        row_confirmed = row.confirmed or 0
        row_date = row.date
        row_deaths = row.deaths or 0
        row_name = row.city if row.place_type == "city" else "TOTAL NO ESTADO"

        if row_name not in state_cities:
            print(f"ERRO: município {repr(row_name)} não encontrado.")
            continue
        if row_confirmed == 0 and row_deaths == 0:
            # No data for this city in this day
            continue
        if row_date not in confirmed:
            confirmed[row_date] = {}
        if row_date not in deaths:
            deaths[row_date] = {}
        if row_name in confirmed[row_date] or row_name in deaths[row_date]:
            print(f"ERRO: conflito em {repr(row_name)} para {row_date}.")
            continue

        confirmed[row_date][row_name] = row_confirmed
        deaths[row_date][row_name] = row_deaths

    result = []
    dates = sorted(confirmed.keys(), reverse=True)
    for city in state_cities:
        row = {"municipio": city}
        for date in dates:
            date_str = f"{date.day:02d}_{date.month:02d}"
            row[f"confirmados_{date_str}"] = confirmed[date].get(city, None)
            row[f"mortes_{date_str}"] = deaths[date].get(city, None)
        result.append(row)
    rows.export_to_csv(rows.import_from_dicts(result), output_filename)
Example #21
 def parse_boletim(self, state, data):
     self.logger.info(f"Parsing {state} boletim")
     try:
         reports = rows.import_from_dicts(
             data,
             force_types={
                 "date": rows.fields.DateField,
                 "notes": rows.fields.TextField,
                 "state": rows.fields.TextField,
                 "url": rows.fields.TextField,
             },
         )
     except Exception as exp:
         self.errors[state].append(("boletim", state, f"{exp.__class__.__name__}: {exp}"))
         return
     for report in reports:
         report = report._asdict()
         self.logger.debug(report)
         self.boletim_writer.writerow(report)
Example #22
def main():
    csv_filename = pathlib.Path('data/names.csv')
    output = pathlib.Path('output/names-stats.csv')
    if not csv_filename.parent.exists():
        os.makedirs(str(csv_filename.parent.absolute()), exist_ok=True)
    if not output.parent.exists():
        os.makedirs(str(output.parent.absolute()), exist_ok=True)
    requests_cache.install_cache('nomes-ibge')

    result = []
    for name in unique_names(csv_filename):
        print(name)
        row = download_name_stats(name)
        if row is None:
            continue
        row['alternative_names'] = '|'.join(row['alternative_names'])
        result.append(row)
    table = rows.import_from_dicts(result)
    table.order_by('name')
    rows.utils.export_to_uri(table, str(output.absolute()))
Example #23
    def test_export_to_dicts(self):
        table = rows.import_from_dicts(self.data)
        result = rows.export_to_dicts(table)
        full_data = [
                {'name': 'Álvaro',
                 'ids': '123',
                 'number': 3,
                 'other': None, },
                {'name': 'Test',
                 'ids': '456',
                 'number': None,
                 'other': None, },
                {'name': 'Python',
                 'ids': '123, 456',
                 'number': None,
                 'other': 3.14, },]

        self.assertEqual(len(result), len(table))
        for expected, actual in zip(full_data, result):
            self.assertDictEqual(expected, actual)
Example #24
    def test_export_to_dicts(self):
        table = rows.import_from_dicts(self.data)
        result = rows.export_to_dicts(table)
        full_data = [
                {'name': 'Álvaro',
                 'ids': '123',
                 'number': 3,
                 'other': None, },
                {'name': 'Test',
                 'ids': '456',
                 'number': None,
                 'other': None, },
                {'name': 'Python',
                 'ids': '123, 456',
                 'number': None,
                 'other': 3.14, },]

        self.assertEqual(len(result), len(table))
        for expected, actual in zip(full_data, result):
            self.assertDictEqual(expected, actual)
Example #25
def csv2sqlite(
    input_filename,
    output_filename,
    table_name,
    samples=30000,
    batch_size=10000,
    encoding="utf-8",
    callback=None,
    force_types=None,
):

    # Identify data types
    fobj = open_compressed(input_filename, encoding)
    reader = csv.reader(fobj)
    header = next(reader)
    data = []
    for index, row in enumerate(reader):
        row = dict(zip(header, row))
        if index == samples:
            break
        data.append(row)
    fields = rows.import_from_dicts(data, import_fields=header).fields
    if force_types is not None:
        fields.update(force_types)

    # Create lazy table object to be converted
    table = rows.Table(fields=fields)
    reader = csv.reader(open_compressed(input_filename, encoding))
    next(reader)  # skip header
    table._rows = reader

    # Export to SQLite
    rows.export_to_sqlite(table,
                          output_filename,
                          table_name=table_name,
                          callback=callback,
                          batch_size=batch_size)
Example #26
    def parse_state_file(self, response):
        state = response.meta["state"]
        if response.status >= 400:
            self.errors[state].append(
                ("connection", state, f"HTTP status code: {response.status}")
            )
        else:
            response_data = json.load(io.BytesIO(response.body))
            try:
                self.parse_boletim(state, response_data["reports"])
            except Exception as exp:
                self.errors[state].append(
                    ("boletim", state, f"{exp.__class__.__name__}: {exp}")
                )
            try:
                self.parse_caso(state, response_data["cases"])
            except Exception as exp:
                self.errors[state].append(
                    ("caso", state, f"{exp.__class__.__name__}: {exp}")
                )

        if self.errors[state]:
            error_counter = Counter(error[0] for error in self.errors[state])
            error_counter_str = ", ".join(
                f"{error_type}: {count}" for error_type, count in error_counter.items()
            )
            self.logger.error(
                f"{len(self.errors[state])} errors found when parsing {state} ({error_counter_str})"
            )
            error_header = ("sheet", "state", "message")
            errors = rows.import_from_dicts(
                [dict(zip(error_header, row)) for row in self.errors[state]]
            )
            filename = ERROR_PATH / f"errors-{state}.csv"
            if not filename.parent.exists():
                filename.parent.mkdir(parents=True)
            rows.export_to_csv(errors, filename)
Example #27
    def test_import_from_dicts_return_desired_data(self):
        table = rows.import_from_dicts(self.data)

        self.assertEqual(len(table), 3)
        self.assertEqual(len(table.fields), 4)
        self.assertEqual(set(table.field_names),
                         set(["ids", "name", "number", "other"]))
        self.assertEqual(table.fields["name"], rows.fields.TextField)
        self.assertEqual(table.fields["ids"], rows.fields.TextField)
        self.assertEqual(table.fields["number"], rows.fields.IntegerField)
        self.assertEqual(table.fields["other"], rows.fields.FloatField)

        self.assertEqual(table[0].name, "Álvaro")
        self.assertEqual(table[0].ids, "123")
        self.assertEqual(table[0].number, 3)
        self.assertEqual(table[0].other, None)
        self.assertEqual(table[1].name, "Test")
        self.assertEqual(table[1].ids, "456")
        self.assertEqual(table[1].number, None)
        self.assertEqual(table[1].other, None)
        self.assertEqual(table[2].name, "Python")
        self.assertEqual(table[2].ids, "123, 456")
        self.assertEqual(table[2].number, None)
        self.assertEqual(table[2].other, 3.14)
Example #28
    def test_import_from_dicts_return_desired_data(self):
        table = rows.import_from_dicts(self.data)

        self.assertEqual(len(table), 3)
        self.assertEqual(len(table.fields), 4)
        self.assertEqual(set(table.field_names),
                         set(['ids', 'name', 'number', 'other']))
        self.assertEqual(table.fields['name'], rows.fields.TextField)
        self.assertEqual(table.fields['ids'], rows.fields.TextField)
        self.assertEqual(table.fields['number'], rows.fields.IntegerField)
        self.assertEqual(table.fields['other'], rows.fields.FloatField)

        self.assertEqual(table[0].name, 'Álvaro')
        self.assertEqual(table[0].ids, '123')
        self.assertEqual(table[0].number, 3)
        self.assertEqual(table[0].other, None)
        self.assertEqual(table[1].name, 'Test')
        self.assertEqual(table[1].ids, '456')
        self.assertEqual(table[1].number, None)
        self.assertEqual(table[1].other, None)
        self.assertEqual(table[2].name, 'Python')
        self.assertEqual(table[2].ids, '123, 456')
        self.assertEqual(table[2].number, None)
        self.assertEqual(table[2].other, 3.14)
Example #29
    def test_import_from_dicts_return_desired_data(self):
        table = rows.import_from_dicts(self.data)

        self.assertEqual(len(table), 3)
        self.assertEqual(len(table.fields), 4)
        self.assertEqual(set(table.field_names),
                         set(['ids', 'name', 'number', 'other']))
        self.assertEqual(table.fields['name'], rows.fields.TextField)
        self.assertEqual(table.fields['ids'], rows.fields.TextField)
        self.assertEqual(table.fields['number'], rows.fields.IntegerField)
        self.assertEqual(table.fields['other'], rows.fields.FloatField)

        self.assertEqual(table[0].name, 'Álvaro')
        self.assertEqual(table[0].ids, '123')
        self.assertEqual(table[0].number, 3)
        self.assertEqual(table[0].other, None)
        self.assertEqual(table[1].name, 'Test')
        self.assertEqual(table[1].ids, '456')
        self.assertEqual(table[1].number, None)
        self.assertEqual(table[1].other, None)
        self.assertEqual(table[2].name, 'Python')
        self.assertEqual(table[2].ids, '123, 456')
        self.assertEqual(table[2].number, None)
        self.assertEqual(table[2].other, 3.14)
Example #30
 def to_csv(self, path: Path) -> Path:
     data = (
         {"date": key, "value": self.data[key]} for key in sorted(self.data.keys())
     )
     export_to_csv(import_from_dicts(data), path)
     return path
Example #31
def pgimport(
    filename,
    database_uri,
    table_name,
    encoding="utf-8",
    dialect=None,
    create_table=True,
    schema=None,
    callback=None,
    timeout=0.1,
    chunk_size=8388608,
    max_samples=10000,
):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """

    fobj = open_compressed(filename, mode="r", encoding=encoding)
    sample = fobj.read(chunk_size)

    if dialect is None:  # Detect dialect
        dialect = rows.plugins.csv.discover_dialect(
            sample.encode(encoding), encoding=encoding
        )
    elif isinstance(dialect, six.text_type):
        dialect = csv.get_dialect(dialect)

    if schema is None:
        # Detect field names
        reader = csv.reader(io.StringIO(sample), dialect=dialect)
        field_names = [slug(field_name) for field_name in next(reader)]

    else:
        field_names = list(schema.keys())

    if create_table:
        if schema is None:
            data = [
                dict(zip(field_names, row))
                for row in itertools.islice(reader, max_samples)
            ]
            table = rows.import_from_dicts(data)
            field_types = [table.fields[field_name] for field_name in field_names]
        else:
            field_types = list(schema.values())

        columns = [
            "{} {}".format(name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name, field_types=", ".join(columns)
        )
        execute_command(get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        dialect=dialect,
        direction="FROM",
        encoding=encoding,
        header=field_names,
        table_name=table_name,
    )
    rows_imported, error = 0, None
    fobj = open_compressed(filename, mode="rb")
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        total_written = 0
        while data != b"":
            written = process.stdin.write(data)
            total_written += written
            if callback:
                callback(written, total_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b"":
            raise RuntimeError(stderr.decode("utf-8"))
        rows_imported = int(stdout.replace(b"COPY ", b"").strip())

    except FileNotFoundError:
        raise RuntimeError("Command `psql` not found")

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode("utf-8"))

    return {"bytes_written": total_written, "rows_imported": rows_imported}
Example #32
import argparse
import os

import rows
import rows.utils

from ba_parse_pdf import extract_table as ba_extract_table
from sc_parse_pdf import extract_table as sc_extract_table

extract_table_functions = {"BA": ba_extract_table, "SC": sc_extract_table}
parser = argparse.ArgumentParser()
parser.add_argument("state", choices=["BA", "SC"])
parser.add_argument("input_uri")
parser.add_argument("output_filename")
args = parser.parse_args()

input_uri, delete = args.input_uri, False
if input_uri.lower().startswith("http://") or input_uri.lower().startswith(
        "https://"):
    source = rows.utils.download_file(input_uri, progress=True, detect=False)
    input_uri, delete = source.uri, True

data = extract_table_functions[args.state](input_uri)
table = rows.import_from_dicts(data)
rows.export_to_csv(table, args.output_filename)
if delete:
    os.unlink(input_uri)
Example #33
    if 'db.sqlite3' not in os.listdir(settings.BASE_DIR):
        os.system("""
rm -rf db.sqlite3 &&
touch db.sqlite3 &&
python manage.py makemigrations
python manage.py migrate
        """)

    arquivo = open('db.json', mode='r').read()

    data = json.loads(arquivo)
    for (k, v) in data.items():
        print('\n', k)

        if isinstance(v, list):
            data = rows.import_from_dicts(v)
            for row in data:
                if k == 'calendario_temporada':

                    try:
                        pais = Pais.objects.get(pais=row.localizacao['pais'])
                    except Pais.DoesNotExist:
                        pais = Pais.objects.create(
                            pais=row.localizacao['pais']
                        )

                    try:
                        cidade = Cidade.objects.get(cidade=row.localizacao['cidade'])
                    except Cidade.DoesNotExist:
                        cidade = Cidade.objects.create(
                            cidade=row.localizacao['cidade'],
Example #34
def test_extract_table_3():
    expected = rows.import_from_csv(DATA_PATH / "expected_3.csv")
    result = rows.import_from_dicts(extract_table(DATA_PATH / "example_3.pdf"))
    assert_equal(result, expected)
Example #35
def test_extract_table_2():
    expected = []
    result = rows.import_from_dicts(
        extract_table(DATA_PATH / "example_2.pdf")[: len(expected)]
    )
    assert_equal(result, expected)
Example #36
import os
import rows

properties_of_interest = []

for result in os.listdir("results"):
    results = rows.import_from_csv("results/" + result)
    properties = [
        p._asdict() for p in results
        if p.for_rent and p.property_type == "casa" and p.rent_price <= 1500
        and "campinas" in p.city.lower()
    ]
    properties_of_interest.extend(properties)

table = rows.import_from_dicts(properties_of_interest)
rows.export_to_csv(table, "properties_of_interest_2.csv")
Example #37
def convert_names(names, csv_output):
    table = rows.import_from_dicts(
        [name_ibge(name) for name in names if name_ibge(name)])
    rows.export_to_csv(table, csv_output)
Example #38
def export(path: Path = DEFAULT_EXPORT_FILE) -> None:
    """Export all data to CSV."""
    table = import_from_dicts(data())
    export_to_csv(table, path)
Example #39
 def test_export_callback(self):
     table = rows.import_from_dicts([{"id": number} for number in range(10)])
     myfunc = mock.Mock()
     rows.export_to_csv(table, callback=myfunc, batch_size=3)
     self.assertEqual(myfunc.call_count, 4)
     self.assertEqual([x[0][0] for x in myfunc.call_args_list], [3, 6, 9, 10])
Example #40
def pgimport(filename,
             database_uri,
             table_name,
             encoding='utf-8',
             create_table=True,
             progress=False,
             timeout=0.1,
             chunk_size=8388608,
             max_samples=10000):
    """Import data from CSV into PostgreSQL using the fastest method

    Required: psql command
    """

    # Extract a sample from the CSV to detect its dialect and header
    fobj = open_compressed(filename, mode='r', encoding=encoding)
    sample = fobj.read(chunk_size).encode(encoding)
    dialect = rows.plugins.csv.discover_dialect(sample, encoding=encoding)
    reader = csv.reader(io.StringIO(sample.decode(encoding)))
    field_names = [slug(field_name) for field_name in next(reader)]

    if create_table:
        data = [
            dict(zip(field_names, row))
            for row in itertools.islice(reader, max_samples)
        ]
        table = rows.import_from_dicts(data)
        field_types = [table.fields[field_name] for field_name in field_names]
        columns = [
            '{} {}'.format(
                name, POSTGRESQL_TYPES.get(type_, DEFAULT_POSTGRESQL_TYPE))
            for name, type_ in zip(field_names, field_types)
        ]
        create_table = SQL_CREATE_TABLE.format(
            table_name=table_name,
            field_types=', '.join(columns),
        )
        execute_command(
            get_psql_command(create_table, database_uri=database_uri))

    # Prepare the `psql` command to be executed based on collected metadata
    command = get_psql_copy_command(
        database_uri=database_uri,
        table_name=table_name,
        header=field_names,
        dialect=dialect,
        encoding=encoding,
    )
    rows_imported, error, total_size = 0, None, None
    try:
        total_size = uncompressed_size(filename)
    except (RuntimeError, ValueError):
        pass

    if progress:
        progress_bar = tqdm(
            desc='Importing data',
            unit='bytes',
            unit_scale=True,
            unit_divisor=1024,
            total=total_size,
        )

    fobj = open_compressed(filename, mode='rb')
    try:
        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        data = fobj.read(chunk_size)
        while data != b'':
            data_written = process.stdin.write(data)
            if progress:
                progress_bar.update(data_written)
            data = fobj.read(chunk_size)
        stdout, stderr = process.communicate()
        if stderr != b'':
            raise RuntimeError(stderr.decode('utf-8'))
        rows_imported = int(stdout.replace(b'COPY ', b'').strip())

    except FileNotFoundError:
        raise RuntimeError('Command `psql` not found')

    except BrokenPipeError:
        raise RuntimeError(process.stderr.read().decode('utf-8'))

    if progress:
        progress_bar.close()

    return rows_imported
Example #41
 def to_csv(self, path: Path) -> Path:
     """Export the adapter's data to a CSV file."""
     table = import_from_dicts(self.export())
     export_to_csv(table, path)
     return path
Example #42
 def process_item(self, item, spider):
     row = import_from_dicts([dict(item)])
     logger.debug(row)
     export_to_sqlite(row, self.conn, self.table)
     return item