Exemple #1
0
def join(keys, tables):
    """Merge a list of `Table` objects using `keys` to group rows"""

    # Make new (merged) Table fields
    fields = OrderedDict()
    for table in tables:
        fields.update(table.fields)
    # TODO: may raise an error if a same field is different in some tables

    # Check if all keys are inside merged Table's fields
    fields_keys = set(fields.keys())
    for key in keys:
        if key not in fields_keys:
            raise ValueError('Invalid key: "{}"'.format(key))

    # Group rows by key, without missing ordering
    none_fields = lambda: OrderedDict({field: None for field in fields.keys()})
    data = OrderedDict()
    for table in tables:
        for row in table:
            row_key = tuple([getattr(row, key) for key in keys])
            if row_key not in data:
                data[row_key] = none_fields()
            data[row_key].update(row._asdict())

    merged = Table(fields=fields)
    merged.extend(data.values())
    return merged
Exemple #2
0
def join(keys, tables):
    """Merge a list of `Table` objects using `keys` to group rows"""

    # Make new (merged) Table fields
    fields = OrderedDict()
    for table in tables:
        fields.update(table.fields)
    # TODO: may raise an error if a same field is different in some tables

    # Check if all keys are inside merged Table's fields
    fields_keys = set(fields.keys())
    for key in keys:
        if key not in fields_keys:
            raise ValueError('Invalid key: "{}"'.format(key))

    # Group rows by key, without missing ordering
    none_fields = lambda: OrderedDict({field: None for field in fields.keys()})
    data = OrderedDict()
    for table in tables:
        for row in table:
            row_key = tuple([getattr(row, key) for key in keys])
            if row_key not in data:
                data[row_key] = none_fields()
            data[row_key].update(row._asdict())

    merged = Table(fields=fields)
    merged.extend(data.values())
    return merged
Exemple #3
0
def create_table(data,
                 meta=None,
                 fields=None,
                 skip_header=True,
                 import_fields=None,
                 samples=None,
                 force_types=None,
                 *args,
                 **kwargs):
    """Create a rows.Table object based on data rows and some configurations

    - `skip_header` is only used if `fields` is set
    - `samples` is only used if `fields` is `None`. If samples=None, all data
      is filled in memory - use with caution.
    - `force_types` is only used if `fields` is `None`
    - `import_fields` can be used either if `fields` is set or not, the
      resulting fields will seek its order
    - `fields` must always be in the same order as the data
    """

    table_rows = iter(data)
    force_types = force_types or {}
    if import_fields is not None:
        import_fields = make_header(import_fields)

    if fields is None:  # autodetect field types
        # TODO: may add `type_hints` parameter so autodetection can be easier
        #       (plugins may specify some possible field types).
        header = make_header(next(table_rows))

        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
            table_rows = chain(sample_rows, table_rows)
        else:
            sample_rows = table_rows = list(table_rows)

        # Detect field types using only the desired columns
        detected_fields = detect_types(
            header,
            sample_rows,
            skip_indexes=[
                index for index, field in enumerate(header)
                if field in force_types or field not in (
                    import_fields or header)
            ],
            *args,
            **kwargs)
        # Check if any field was added during detecting process
        new_fields = [
            field_name for field_name in detected_fields.keys()
            if field_name not in header
        ]
        # Finally create the `fields` with both header and new field names,
        # based on detected fields `and force_types`
        fields = OrderedDict([(field_name,
                               detected_fields.get(field_name, TextField))
                              for field_name in header + new_fields])
        fields.update(force_types)

        # Update `header` and `import_fields` based on new `fields`
        header = list(fields.keys())
        if import_fields is None:
            import_fields = header

    else:  # using provided field types
        if not isinstance(fields, OrderedDict):
            raise ValueError("`fields` must be an `OrderedDict`")

        if skip_header:
            # If we're skipping the header probably this row is not trustable
            # (can be data or garbage).
            _ = next(table_rows)

        header = make_header(list(fields.keys()))
        if import_fields is None:
            import_fields = header

        fields = OrderedDict([(field_name, fields[key])
                              for field_name, key in zip(header, fields)])

    diff = set(import_fields) - set(header)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))
    fields = OrderedDict([(field_name, fields[field_name])
                          for field_name in import_fields])

    get_row = get_items(*map(header.index, import_fields))
    table = Table(fields=fields, meta=meta)
    table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows)

    return table
Exemple #4
0
def create_table(
    data,
    meta=None,
    fields=None,
    skip_header=True,
    import_fields=None,
    samples=None,
    force_types=None,
    *args,
    **kwargs
):
    """Create a rows.Table object based on data rows and some configurations

    - `skip_header` is only used if `fields` is set
    - `samples` is only used if `fields` is `None`. If samples=None, all data
      is filled in memory - use with caution.
    - `force_types` is only used if `fields` is `None`
    - `import_fields` can be used either if `fields` is set or not, the
      resulting fields will seek its order
    - `fields` must always be in the same order as the data
    """

    table_rows = iter(data)
    force_types = force_types or {}
    if import_fields is not None:
        import_fields = make_header(import_fields)

    if fields is None:  # autodetect field types
        # TODO: may add `type_hints` parameter so autodetection can be easier
        #       (plugins may specify some possible field types).
        header = make_header(next(table_rows))

        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
            table_rows = chain(sample_rows, table_rows)
        else:
            sample_rows = table_rows = list(table_rows)

        # Detect field types using only the desired columns
        detected_fields = detect_types(
            header,
            sample_rows,
            skip_indexes=[
                index
                for index, field in enumerate(header)
                if field in force_types or field not in (import_fields or header)
            ],
            *args,
            **kwargs
        )
        # Check if any field was added during detecting process
        new_fields = [
            field_name
            for field_name in detected_fields.keys()
            if field_name not in header
        ]
        # Finally create the `fields` with both header and new field names,
        # based on detected fields `and force_types`
        fields = OrderedDict(
            [
                (field_name, detected_fields.get(field_name, TextField))
                for field_name in header + new_fields
            ]
        )
        fields.update(force_types)

        # Update `header` and `import_fields` based on new `fields`
        header = list(fields.keys())
        if import_fields is None:
            import_fields = header

    else:  # using provided field types
        if not isinstance(fields, OrderedDict):
            raise ValueError("`fields` must be an `OrderedDict`")

        if skip_header:
            # If we're skipping the header probably this row is not trustable
            # (can be data or garbage).
            _ = next(table_rows)

        header = make_header(list(fields.keys()))
        if import_fields is None:
            import_fields = header

        fields = OrderedDict(
            [(field_name, fields[key]) for field_name, key in zip(header, fields)]
        )

    diff = set(import_fields) - set(header)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))
    fields = OrderedDict(
        [(field_name, fields[field_name]) for field_name in import_fields]
    )

    get_row = get_items(*map(header.index, import_fields))
    table = Table(fields=fields, meta=meta)
    table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows)

    return table