Example #1
def prepare_to_export(table, export_fields=None, *args, **kwargs):
    # TODO: optimize for the most common case (export_fields=None)
    table_type = type(table)
    if table_type not in (FlexibleTable, Table):
        raise ValueError("Table type not recognized")

    if export_fields is None:
        # use the already-slugged field names
        export_fields = table.field_names
    else:
        # we need to slug all the field names
        export_fields = make_header(export_fields)

    table_field_names = table.field_names
    diff = set(export_fields) - set(table_field_names)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))

    yield export_fields

    if table_type is Table:
        field_indexes = list(map(table_field_names.index, export_fields))
        for row in table._rows:
            yield [row[field_index] for field_index in field_indexes]
    elif table_type is FlexibleTable:
        for row in table._rows:
            yield [row[field_name] for field_name in export_fields]
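
A minimal usage sketch (assuming `prepare_to_export` is in scope and using the `rows` library's `Table`; the table contents are made up). The function is a generator: the first item yielded is the header, then one list per row.

from collections import OrderedDict

import rows

table = rows.Table(fields=OrderedDict([
    ("name", rows.fields.TextField),
    ("age", rows.fields.IntegerField),
]))
table.append({"name": "Alice", "age": 30})
table.append({"name": "Bob", "age": 25})

data = prepare_to_export(table, export_fields=["name"])
next(data)  # header: ["name"]
next(data)  # first row: ["Alice"]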
Example #2
def command_csv_to_sqlite(
    batch_size, samples, input_encoding, dialect, schemas, sources, output
):

    inputs = [pathlib.Path(filename) for filename in sources]
    output = pathlib.Path(output)
    table_names = make_header([filename.name.split(".")[0] for filename in inputs])
    schemas = _get_schemas_for_inputs(schemas, inputs)

    for filename, table_name, schema in zip(inputs, table_names, schemas):
        prefix = "[{filename} -> {db_filename}#{tablename}]".format(
            db_filename=output.name, tablename=table_name, filename=filename.name
        )
        pre_prefix = "{} (detecting data types)".format(prefix)
        progress = ProgressBar(prefix=prefix, pre_prefix=pre_prefix)
        csv_to_sqlite(
            six.text_type(filename),
            six.text_type(output),
            dialect=dialect,
            table_name=table_name,
            samples=samples,
            batch_size=batch_size,
            callback=progress.update,
            encoding=input_encoding,
            schema=schema,
        )
        progress.close()
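
This looks like the body of a click command from the `rows` CLI; a hypothetical direct call might look like the sketch below (the paths, option values, and the assumption that `schemas=None` is accepted are mine, not the source's).

# Hypothetical direct call; in the real CLI these values come from
# click options:
command_csv_to_sqlite(
    batch_size=10000,
    samples=5000,
    input_encoding="utf-8",
    dialect="excel",
    schemas=None,
    sources=["cities.csv", "states.csv"],
    output="data.sqlite",
)
# Each CSV becomes one table in data.sqlite, named after the file
# ("cities.csv" -> table "cities", slugged by make_header).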
Example #3
def _get_import_fields(fields, fields_exclude):
    if fields is not None and fields_exclude is not None:
        click.echo("ERROR: `--fields` cannot be used with `--fields-exclude`", err=True)
        sys.exit(20)
    elif fields is not None:
        return make_header(fields.split(","), permit_not=False)
    else:
        return None
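
A sketch of the expected behavior, assuming `make_header` slugs names the way the `rows` library does (the field names are made up):

_get_import_fields("Name,Birth Date", None)  # -> ["name", "birth_date"]
_get_import_fields(None, None)               # -> None
# Passing both prints an error and exits with status 20:
# _get_import_fields("name", "age")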
Example #4
def deserialize(self, row):
    field_names = list(row.keys())
    field_mapping = {
        old: self.fields[new]
        for old, new in zip(field_names, make_header(field_names))
    }
    return {
        key: field_mapping[key].deserialize(value) for key, value in row.items()
    }
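
A sketch of what this method does, assuming a schema object whose `fields` maps slugged names to `rows` field classes (the schema and values are made up):

# Suppose schema.fields == {"name": TextField, "birth_date": DateField}:
schema.deserialize({"Name": "Alice", "Birth Date": "2018-01-01"})
# -> {"Name": "Alice", "Birth Date": datetime.date(2018, 1, 1)}
# make_header maps "Name" -> "name" and "Birth Date" -> "birth_date",
# so each raw key is deserialized by its matching field class.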
Example #5
def join(
    input_encoding,
    output_encoding,
    input_locale,
    output_locale,
    verify_ssl,
    order_by,
    fields,
    fields_exclude,
    keys,
    sources,
    destination,
):

    export_fields = _get_import_fields(fields, fields_exclude)
    keys = make_header(keys.split(","), permit_not=False)

    if input_locale is not None:
        with rows.locale_context(input_locale):
            tables = [
                _import_table(source, encoding=input_encoding, verify_ssl=verify_ssl)
                for source in sources
            ]
    else:
        tables = [
            _import_table(source, encoding=input_encoding, verify_ssl=verify_ssl)
            for source in sources
        ]

    result = rows.join(keys, tables)
    if order_by is not None:
        order_by = _get_field_names(order_by, result.field_names, permit_not=True)
        # TODO: use complete list of `order_by` fields
        result.order_by(order_by[0].replace("^", "-"))

    if export_fields is None:
        export_fields = _get_export_fields(result.field_names, fields_exclude)
    # TODO: may use sys.stdout.encoding if output_file = '-'
    output_encoding = output_encoding or DEFAULT_OUTPUT_ENCODING
    if output_locale is not None:
        with rows.locale_context(output_locale):
            export_to_uri(
                result,
                destination,
                encoding=output_encoding,
                export_fields=export_fields,
            )
    else:
        export_to_uri(
            result, destination, encoding=output_encoding, export_fields=export_fields
        )
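
A hypothetical invocation (the file names and key columns are made up): the command imports every source, joins them on the given keys, and exports the result to the destination.

join(
    input_encoding="utf-8",
    output_encoding=None,  # falls back to DEFAULT_OUTPUT_ENCODING
    input_locale=None,
    output_locale=None,
    verify_ssl=True,
    order_by=None,
    fields=None,
    fields_exclude=None,
    keys="state,city",
    sources=["population.csv", "area.csv"],
    destination="joined.csv",
)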
Example #6
def _get_field_names(field_names, table_field_names, permit_not=False):
    new_field_names = make_header(field_names.split(","), permit_not=permit_not)
    if not permit_not:
        diff = set(new_field_names) - set(table_field_names)
    else:
        diff = set(field_name.replace("^", "") for field_name in new_field_names) - set(
            table_field_names
        )

    if diff:
        missing = ", ".join(['"{}"'.format(field) for field in diff])
        click.echo("Table does not have fields: {}".format(missing), err=True)
        sys.exit(1)
    else:
        return new_field_names
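
A behavior sketch (the field names are made up; it assumes `make_header` keeps the "^" prefix when `permit_not=True`, which the CLI uses for descending `order_by`):

table_fields = ["name", "birth_date"]
_get_field_names("Name,Birth Date", table_fields)
# -> ["name", "birth_date"]
_get_field_names("^birth_date", table_fields, permit_not=True)
# -> ["^birth_date"]
_get_field_names("missing", table_fields)
# -> prints 'Table does not have fields: "missing"' and exits(1)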
Example #7
def create_table(data,
                 meta=None,
                 fields=None,
                 skip_header=True,
                 import_fields=None,
                 samples=None,
                 force_types=None,
                 *args,
                 **kwargs):
    """Create a rows.Table object based on data rows and some configurations

    - `skip_header` is only used if `fields` is set
    - `samples` is only used if `fields` is `None`. If `samples` is `None`,
      all data is loaded into memory - use with caution.
    - `force_types` is only used if `fields` is `None`
    - `import_fields` can be used whether `fields` is set or not; the
      resulting fields will follow its order
    - `fields` must always be in the same order as the data
    """

    table_rows = iter(data)
    force_types = force_types or {}
    if import_fields is not None:
        import_fields = make_header(import_fields)

    if fields is None:  # autodetect field types
        # TODO: may add `type_hints` parameter so autodetection can be easier
        #       (plugins may specify some possible field types).
        header = make_header(next(table_rows))

        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
            table_rows = chain(sample_rows, table_rows)
        else:
            sample_rows = table_rows = list(table_rows)

        # Detect field types using only the desired columns
        detected_fields = detect_types(
            header,
            sample_rows,
            skip_indexes=[
                index for index, field in enumerate(header)
                if field in force_types or field not in (
                    import_fields or header)
            ],
            *args,
            **kwargs)
        # Check if any field was added during the detection process
        new_fields = [
            field_name for field_name in detected_fields.keys()
            if field_name not in header
        ]
        # Finally, create `fields` with both header and new field names,
        # based on the detected fields and `force_types`
        fields = OrderedDict([(field_name,
                               detected_fields.get(field_name, TextField))
                              for field_name in header + new_fields])
        fields.update(force_types)

        # Update `header` and `import_fields` based on new `fields`
        header = list(fields.keys())
        if import_fields is None:
            import_fields = header

    else:  # using provided field types
        if not isinstance(fields, OrderedDict):
            raise ValueError("`fields` must be an `OrderedDict`")

        if skip_header:
            # If we're skipping the header, this row is probably not
            # trustworthy (it can be data or garbage).
            _ = next(table_rows)

        header = make_header(list(fields.keys()))
        if import_fields is None:
            import_fields = header

        fields = OrderedDict([(field_name, fields[key])
                              for field_name, key in zip(header, fields)])

    diff = set(import_fields) - set(header)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))
    fields = OrderedDict([(field_name, fields[field_name])
                          for field_name in import_fields])

    get_row = get_items(*map(header.index, import_fields))
    table = Table(fields=fields, meta=meta)
    table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows)

    return table
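
A minimal sketch of the autodetection path (the data values are made up; row attribute access assumes the usual `rows` namedtuple-style rows):

data = [
    ["Name", "Age"],
    ["Alice", "30"],
    ["Bob", "25"],
]
table = create_table(data)
table.field_names  # -> ["name", "age"], slugged by make_header
table[0].age       # -> 30, "Age" having been detected as an integer field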