Example #1
def command_pgimport(input_encoding, no_create_table, source, database_uri,
                     table_name):

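    # Forward the CLI options to pgimport; note that the no_create_table
    # option is inverted into the create_table argument.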
    pgimport(
        filename=source,
        encoding=input_encoding,
        database_uri=database_uri,
        create_table=not no_create_table,
        table_name=table_name,
        progress=True,
    )
Example #2
def main():
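    # Import each gzipped CSV with its matching schema file, then index the table.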
    for table in ("empresa", "socio", "cnae_secundaria"):
        pgimport(
            str(DATA_DIRECTORY / f"{table}.csv.gz"),
            POSTGRES_URI,
            table,
            schema=load_schema(str(SCHEMA_DIRECTORY / f"{table}.csv")),
        )
        create_index(table)

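    # cnaes_csv() presumably yields paths to a generated CSV and its schema file.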
    with cnaes_csv() as (source, schema):
        pgimport(source, POSTGRES_URI, "cnae", schema=load_schema(schema))
        create_index("cnae", "codigo")
Example #3
def command_pgimport(
    input_encoding, no_create_table, dialect, schema, source, database_uri, table_name
):

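    # A byte-based progress bar: try to detect the uncompressed size for a total
    # and fall back to an indeterminate bar when detection fails.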
    progress = ProgressBar(
        prefix="Importing data", pre_prefix="Detecting file size", unit="bytes"
    )
    try:
        total_size = uncompressed_size(source)
    except (RuntimeError, ValueError):
        total_size = None
    else:
        progress.total = total_size
    progress.description = "Analyzing source file"
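    # Use the explicitly provided schema if any; otherwise let the helper
    # derive one from the source file.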
    schemas = _get_schemas_for_inputs([schema] if schema else None, [source])
    import_meta = pgimport(
        filename=source,
        encoding=input_encoding,
        dialect=dialect,
        database_uri=database_uri,
        create_table=not no_create_table,
        table_name=table_name,
        callback=progress.update,
        schema=schemas[0],
    )
    progress.description = "{} rows imported".format(import_meta["rows_imported"])
    progress.close()
Example #4
def main():
    if wait_for_postgres() and tables_exist():
        print("There are existing tables in the database. "
              "Please start with a clean database.")
        return

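    # Note the mismatch: table names use underscores while the source files
    # are named with hyphens.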
    tables = ("empresa", "socio", "cnae_secundaria")
    files = ("empresa", "socio", "cnae-secundaria")
    for table, filename in zip(tables, files):
        pgimport(
            str(DATA_DIRECTORY / f"{filename}.csv.gz"),
            POSTGRES_URI,
            table,
            schema=load_schema(str(SCHEMA_DIRECTORY / f"{filename}.csv")),
        )
        create_index(table)

    with cnaes_csv() as (source, schema):
        pgimport(source, POSTGRES_URI, "cnae", schema=load_schema(schema))
        create_index("cnae", "codigo")
Example #5
    def handle(self, *args, **kwargs):
        dataset_slug = kwargs["dataset_slug"]
        tablename = kwargs["tablename"]
        filename = kwargs["filename"]
        ask_confirmation = not kwargs["no_input"]
        import_data = not kwargs["no_import_data"]
        vacuum = not kwargs["no_vacuum"]
        clear_view_cache = not kwargs["no_clear_view_cache"]
        create_filter_indexes = not kwargs["no_create_filter_indexes"]
        fill_choices = not kwargs["no_fill_choices"]
        collect_date = self.clean_collect_date(kwargs["collect_date"])

        if ask_confirmation:
            print("This operation will DESTROY the existing data for this " "dataset table.")
            answer = input("Do you want to continue? (y/n) ")
            if answer.lower().strip() not in ("y", "yes"):
                exit()

        table = Table.objects.for_dataset(dataset_slug).named(tablename)
        Model = table.get_model()

        if import_data:
            # Recreate the table (drop the old one first if it exists)
            with transaction.atomic():
                try:
                    Model.delete_table()
                except ProgrammingError:  # Does not exist
                    pass
                finally:
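                    # Create the table without indexes; they are built after the load.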
                    Model.create_table(create_indexes=False)
                    Model.create_triggers()

            # Get file object, header and set command to run
            table_name = Model._meta.db_table
            database_uri = os.environ["DATABASE_URL"]
            encoding = "utf-8"  # TODO: receive as a parameter
            timeout = 0.1  # TODO: receive as a parameter
            start_time = time.time()
            progress = ProgressBar(prefix="Importing data", unit="bytes")

            # TODO: change the way we do it (CSV dialect may change, encoding
            # etc.)
            file_header = open_compressed(filename).readline().strip().split(",")
            table_schema = table.schema
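            # Keep only the fields present in the file header, in header order.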
            schema = OrderedDict(
                [(field_name, table_schema[field_name]) for field_name in file_header]
            )
            try:
                import_meta = pgimport(
                    filename=filename,
                    encoding=encoding,
                    dialect="excel",
                    database_uri=database_uri,
                    table_name=table_name,
                    create_table=False,
                    timeout=timeout,
                    callback=progress.update,
                    schema=schema,
                )
            except RuntimeError as exception:
                progress.close()
                print("ERROR: {}".format(exception.args[0]))
                exit(1)
            else:
                progress.close()
                table.import_date = timezone.now()
                table.save()
                if collect_date:
                    table.version.collected_at = collect_date
                    table.version.save()
                end_time = time.time()
                duration = end_time - start_time
                rows_imported = import_meta["rows_imported"]
                print(
                    "  done in {:7.3f}s ({} rows imported, {:.3f} rows/s).".format(
                        duration, rows_imported, rows_imported / duration
                    )
                )
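            # Reload the model bypassing the cache, since the table was just recreated.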
            Model = table.get_model(cache=False)
            table.invalidate_cache()

        if vacuum:
            print("Running VACUUM ANALYSE...", end="", flush=True)
            start = time.time()
            Model.analyse_table()
            end = time.time()
            print("  done in {:.3f}s.".format(end - start))

        if create_filter_indexes:
            # TODO: warn if field has_choices but not in Table.filtering
            print("Creating filter indexes...", end="", flush=True)
            start = time.time()
            Model.create_indexes()
            end = time.time()
            print("  done in {:.3f}s.".format(end - start))

        if fill_choices:
            print("Filling choices...")
            start = time.time()
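            # Recompute and persist the cached choices for every field flagged
            # as choiceable.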
            choiceables = Field.objects.for_table(table).choiceables()
            for field in choiceables:
                print("  {}".format(field.name), end="", flush=True)
                start_field = time.time()
                field.update_choices()
                field.save()
                end_field = time.time()
                print(" - done in {:.3f}s.".format(end_field - start_field))
            end = time.time()
            print("  done in {:.3f}s.".format(end - start))

        if clear_view_cache:
            print("Clearing view cache...")
            cache.clear()
Example #6
    def handle(self, *args, **kwargs):
        dataset_slug = kwargs['dataset_slug']
        tablename = kwargs['tablename']
        filename = kwargs['filename']
        ask_confirmation = not kwargs['no_input']
        import_data = not kwargs['no_import_data']
        vacuum = not kwargs['no_vacuum']
        create_filter_indexes = not kwargs['no_create_filter_indexes']
        fill_choices = not kwargs['no_fill_choices']

        if ask_confirmation:
            print(
                'This operation will DESTROY the existing data for this '
                'dataset table.'
            )
            answer = input('Do you want to continue? (y/n) ')
            if answer.lower().strip() not in ('y', 'yes'):
                exit()

        table = Table.objects.for_dataset(dataset_slug).named(tablename)
        Model = table.get_model()

        if import_data:
            # Recreate the table (drop the old one first if it exists)
            with transaction.atomic():
                try:
                    Model.delete_table()
                except ProgrammingError:  # Does not exist
                    pass
                finally:
                    Model.create_table(create_indexes=False)
                    Model.create_triggers()

            # Get file object, header and set command to run
            table_name = Model._meta.db_table
            database_uri = os.environ['DATABASE_URL']
            encoding = 'utf-8'  # TODO: receive as a parameter
            timeout = 0.1  # TODO: receive as a parameter
            start_time = time.time()
            progress = ProgressBar(prefix='Importing data', unit='bytes')
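            # Stream the file into the existing table, reporting progress
            # through the callback.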
            try:
                import_meta = pgimport(
                    filename=filename,
                    encoding=encoding,
                    dialect='excel',
                    database_uri=database_uri,
                    table_name=table_name,
                    create_table=False,
                    timeout=timeout,
                    callback=progress.update,
                )
            except RuntimeError as exception:
                progress.close()
                print('ERROR: {}'.format(exception.args[0]))
                exit(1)
            else:
                progress.close()
                table.last_update = timezone.now()
                table.save()
                end_time = time.time()
                duration = end_time - start_time
                rows_imported = import_meta['rows_imported']
                print('  done in {:7.3f}s ({} rows imported, {:.3f} rows/s).'
                      .format(duration, rows_imported, rows_imported / duration))
            Model = table.get_model(cache=False)

        if vacuum:
            print('Running VACUUM ANALYSE...', end='', flush=True)
            start = time.time()
            Model.analyse_table()
            end = time.time()
            print('  done in {:.3f}s.'.format(end - start))

        if create_filter_indexes:
            # TODO: warn if field has_choices but not in Table.filtering
            print('Creating filter indexes...', end='', flush=True)
            start = time.time()
            Model.create_indexes()
            end = time.time()
            print('  done in {:.3f}s.'.format(end - start))

        if fill_choices:
            print('Filling choices...')
            start = time.time()
            choiceables = Field.objects.for_table(table).choiceables()
            for field in choiceables:
                print('  {}'.format(field.name), end='', flush=True)
                start_field = time.time()
                field.update_choices()
                field.save()
                end_field = time.time()
                print(' - done in {:.3f}s.'.format(end_field - start_field))
            end = time.time()
            print('  done in {:.3f}s.'.format(end - start))