Esempio n. 1
0
def create_table(data, meta=None, force_headers=None, fields=None,
                 skip_header=True, *args, **kwargs):
    """Create a `Table` from row data, detecting field types when needed.

    - `force_headers`: only used when `fields is None`; overrides using the
      first row of `data` as the header.
    - `fields`: if given, must be a `collections.OrderedDict` mapping field
      names to field types, in the same order as the columns in `data`.
    - `skip_header`: only used when `fields` is given; drops the first row.
    Extra `*args`/`**kwargs` are forwarded to `detect_types`.

    Raises `ValueError` if `fields` is not an `OrderedDict` or does not match
    the number of columns in `data`.
    """
    # TODO: add auto_detect_types=True parameter
    table_rows = list(data)

    if fields is None:
        if force_headers is None:
            # First row of data is the header
            header = make_header(table_rows[0])
            table_rows = table_rows[1:]
        else:
            header = force_headers
        fields = detect_types(header, table_rows, *args, **kwargs)
    else:
        # Raise instead of `assert`: asserts are stripped under `python -O`
        if not isinstance(fields, collections.OrderedDict):
            raise ValueError('`fields` must be a `collections.OrderedDict`')

        if skip_header:
            table_rows = table_rows[1:]

        # Re-key `fields` with normalized header names; use OrderedDict so
        # field order (which must match column order) is preserved
        header = make_header(fields.keys())
        fields = collections.OrderedDict(
            [(field_name, fields[key])
             for field_name, key in zip(header, fields)])

        # TODO: may reuse max_columns from html
        max_columns = max(len(row) for row in table_rows)
        if len(fields) != max_columns:
            raise ValueError('`fields` length must match the number of '
                             'columns in `data`')

    # TODO: put this inside Table.__init__
    table = Table(fields=fields, meta=meta)
    for row in table_rows:
        table.append({field_name: value
                      for field_name, value in zip(header, row)})

    return table
Esempio n. 2
0
    def test_detect_types_binary(self):
        """Bytes values detect as BinaryField; base64 text as TextField."""
        # Case 1: raw bytes in every cell -> every field must be binary
        encoded_rows = [[cell.encode('utf-8') for cell in row]
                        for row in self.data]
        detected = fields.detect_types(self.fields, encoded_rows)
        expected = {name: fields.BinaryField for name in self.expected.keys()}
        self.assertDictEqual(dict(detected), expected)

        # Case 2: base64-encoded values as text (`str`/`unicode`) -> fields
        # fall back to text detection
        b64_rows = [[b64encode(cell.encode('utf-8')).decode('ascii')
                     for cell in row]
                    for row in self.data]
        detected = fields.detect_types(self.fields, b64_rows)
        expected = {name: fields.TextField for name in self.expected.keys()}
        self.assertDictEqual(dict(detected), expected)
Esempio n. 3
0
    def __setitem__(self, key, value):
        """Replace a row (int key) or set/overwrite a column (text key)."""
        if type(key) == int:
            self._rows[key] = self._make_row(value)
        elif type(key) is six.text_type:
            from rows import fields
            from rows.plugins import utils

            column = list(value)  # I'm not lazy, sorry
            if len(column) != len(self):
                raise ValueError(
                    "Values length ({}) should be the same as "
                    "Table length ({})".format(len(column), len(self))
                )

            field_name = utils.slug(key)
            adding_column = field_name not in self.field_names
            detected = fields.detect_types(
                [field_name], [[item] for item in column]
            )
            field_type = detected[field_name]
            self.fields[field_name] = field_type
            self.Row = namedtuple("Row", self.field_names)

            if adding_column:
                # New column: extend each row with the deserialized value
                for row, item in zip(self._rows, column):
                    row.append(field_type.deserialize(item))
            else:
                # Existing column: overwrite its values in place
                position = self.field_names.index(field_name)
                for row, item in zip(self._rows, column):
                    row[position] = field_type.deserialize(item)
        else:
            raise ValueError("Unsupported key type: {}".format(type(key).__name__))
Esempio n. 4
0
    def __setitem__(self, key, value):
        """Set a row by integer index or a whole column by (unicode) name."""
        if type(key) == int:
            self._rows[key] = self._make_row(value)
        elif type(key) == unicode:  # TODO: change to 'str' on Python3
            column_values = list(value)  # I'm not lazy, sorry
            if len(column_values) != len(self):
                message = ('Values length ({}) should be the same as '
                           'Table length ({})')
                raise ValueError(
                    message.format(len(column_values), len(self)))

            from rows.fields import detect_types
            from rows.utils import slug

            field_name = slug(key)
            adding_column = field_name not in self.field_names
            detected = detect_types(
                [field_name], [[item] for item in column_values])
            field_type = detected[field_name]
            self.fields[field_name] = field_type
            self.Row = namedtuple('Row', self.field_names)

            if adding_column:
                # New column: append the deserialized value to each row
                for row, item in zip(self._rows, column_values):
                    row.append(field_type.deserialize(item))
            else:
                # Existing column: replace values in place
                position = self.field_names.index(field_name)
                for row, item in zip(self._rows, column_values):
                    row[position] = field_type.deserialize(item)
        else:
            raise ValueError(
                'Unsupported key type: {}'.format(type(key).__name__))
Esempio n. 5
0
    def __setitem__(self, key, value):
        """Assign a row (integer key) or an entire column (text key)."""
        if type(key) == int:
            self._rows[key] = self._make_row(value)
        elif type(key) is six.text_type:
            from rows import fields
            from rows.plugins import utils

            column = list(value)  # I'm not lazy, sorry
            if len(column) != len(self):
                raise ValueError('Values length ({}) should be the same as '
                                 'Table length ({})'.format(
                                     len(column), len(self)))

            field_name = utils.slug(key)
            field_is_new = field_name not in self.field_names
            field_type = fields.detect_types(
                [field_name],
                [[item] for item in column])[field_name]
            self.fields[field_name] = field_type
            self.Row = namedtuple('Row', self.field_names)

            if field_is_new:
                # Brand-new column: extend every row
                for row, item in zip(self._rows, column):
                    row.append(field_type.deserialize(item))
            else:
                # Known column: overwrite in place at its position
                position = self.field_names.index(field_name)
                for row, item in zip(self._rows, column):
                    row[position] = field_type.deserialize(item)
        else:
            raise ValueError('Unsupported key type: {}'.format(
                type(key).__name__))
Esempio n. 6
0
 def test_precedence(self):
     """Each sample value must resolve to its highest-precedence type."""
     expected_pairs = [
         ('bool', fields.BoolField),
         ('integer', fields.IntegerField),
         ('float', fields.FloatField),
         ('datetime', fields.DatetimeField),
         ('date', fields.DateField),
         ('float', fields.FloatField),
         ('percent', fields.PercentField),
         ('json', fields.JSONField),
         ('email', fields.EmailField),
         ('binary1', fields.BinaryField),
         ('binary2', fields.BinaryField),
         ('text', fields.TextField),
     ]
     sample_row = [
         'false',
         '42',
         '3.14',
         '2016-08-15T05:21:10',
         '2016-08-15',
         '2.71',
         '76.38%',
         '{"key": "value"}',
         '*****@*****.**',
         b'cHl0aG9uIHJ1bGVz',
         b'python rules',
         'Álvaro Justen',
     ]
     names = [pair[0] for pair in expected_pairs]
     detected = fields.detect_types(names, [sample_row])
     self.assertDictEqual(dict(detected), dict(expected_pairs))
Esempio n. 7
0
 def test_detect_types_utf8(self):
     """Detection must accept utf-8 encoded data and keep field order."""
     detected = fields.detect_types(self.fields, self.data, encoding='utf-8')
     self.assertEqual(type(detected), collections.OrderedDict)
     self.assertEqual(detected.keys(), self.fields)
     self.assertDictEqual(dict(detected), self.expected)
Esempio n. 8
0
    def test_detect_types_binary(self):
        """Raw bytes detect as BinaryField; base64 strings as TextField."""
        # Case 1: every cell holds raw bytes (`bytes`/`str`)
        binary_expected = {key: fields.BinaryField for key in self.expected.keys()}
        width = len(self.data[0])
        binary_rows = [
            [b"some binary data" for _ in range(width)] for __ in range(20)
        ]
        detected = fields.detect_types(self.fields, binary_rows)
        self.assertDictEqual(dict(detected), binary_expected)

        # Case 2: cells hold base64-encoded text (`str`/`unicode`)
        text_expected = {key: fields.TextField for key in self.expected.keys()}
        text_rows = [
            [b64encode(cell.encode("utf-8")).decode("ascii") for cell in row]
            for row in self.data
        ]
        detected = fields.detect_types(self.fields, text_rows)
        self.assertDictEqual(dict(detected), text_expected)
Esempio n. 9
0
def create_table(data, meta=None, fields=None, skip_header=True,
                 import_fields=None, samples=None, force_types=None,
                 *args, **kwargs):
    """Create a `Table` from row data, detecting field types when needed.

    - `fields`: optional `OrderedDict` mapping field names to field types, in
      the same order as the columns in `data`; when `None`, types are
      detected from the first `samples` rows (all rows if `samples is None`,
      which loads everything in memory - use with caution).
    - `skip_header`: only used when `fields` is given.
    - `import_fields`: optional subset of field names to keep, in order.
    - `force_types`: only used when `fields is None`; overrides detection.
    Extra `*args`/`**kwargs` are forwarded to `detect_types`.
    """
    # TODO: add auto_detect_types=True parameter
    table_rows = iter(data)
    sample_rows = []

    if fields is None:
        # First row of data is the header
        header = make_header(next(table_rows))

        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
        else:
            sample_rows = list(table_rows)

        fields = detect_types(header, sample_rows, *args, **kwargs)

        if force_types is not None:
            # TODO: optimize field detection (ignore fields on `force_types`)
            for field_name, field_type in force_types.items():
                fields[field_name] = field_type
    else:
        if not isinstance(fields, OrderedDict):
            raise ValueError('`fields` must be an `OrderedDict`')

        if skip_header:
            # `None` default: don't crash with StopIteration on empty data
            next(table_rows, None)

        header = make_header(list(fields.keys()))
        fields = OrderedDict([(field_name, fields[key])
                              for field_name, key in zip(header, fields)])

    if import_fields is not None:
        # TODO: can optimize if import_fields is not None.
        #       Example: do not detect all columns
        import_fields = make_header(import_fields)

        diff = set(import_fields) - set(header)
        if diff:
            field_names = ', '.join('"{}"'.format(field) for field in diff)
            raise ValueError("Invalid field names: {}".format(field_names))

        new_fields = OrderedDict()
        for field_name in import_fields:
            new_fields[field_name] = fields[field_name]
        fields = new_fields

    table = Table(fields=fields, meta=meta)
    # TODO: put this inside Table.__init__
    for row in chain(sample_rows, table_rows):
        table.append({field_name: value
                      for field_name, value in zip(header, row)})

    return table
Esempio n. 10
0
 def test_precedence(self):
     """Detection must honour type precedence for each sample value."""
     expected_pairs = [
         ("bool", fields.BoolField),
         ("integer", fields.IntegerField),
         ("float", fields.FloatField),
         ("datetime", fields.DatetimeField),
         ("date", fields.DateField),
         ("float", fields.FloatField),
         ("percent", fields.PercentField),
         ("json", fields.JSONField),
         ("email", fields.EmailField),
         ("binary1", fields.BinaryField),
         ("binary2", fields.BinaryField),
         ("text", fields.TextField),
     ]
     sample_row = [
         "false",
         "42",
         "3.14",
         "2016-08-15T05:21:10",
         "2016-08-15",
         "2.71",
         "76.38%",
         '{"key": "value"}',
         "*****@*****.**",
         b"cHl0aG9uIHJ1bGVz",
         b"python rules",
         "Álvaro Justen",
     ]
     names = [pair[0] for pair in expected_pairs]
     types = [pair[1] for pair in expected_pairs]
     detected = fields.detect_types(names, [sample_row], field_types=types)
     self.assertDictEqual(dict(detected), dict(expected_pairs))
Esempio n. 11
0
 def test_detect_types(self):
     """Default detection on the sample data must match expectations."""
     detected = fields.detect_types(self.fields, self.data)
     self.assertDictEqual(dict(detected), self.expected)
Esempio n. 12
0
 def test_detect_types_no_sample(self):
     """With no sample rows, every field must fall back to BinaryField."""
     expected = {name: fields.BinaryField for name in self.expected.keys()}
     detected = fields.detect_types(self.fields, [])
     self.assertDictEqual(dict(detected), expected)
Esempio n. 13
0
def create_table(data,
                 meta=None,
                 fields=None,
                 skip_header=True,
                 import_fields=None,
                 samples=None,
                 force_types=None,
                 *args,
                 **kwargs):
    """Create a `Table` from row data, detecting field types when needed.

    - `fields`: optional `OrderedDict` mapping field names to field types, in
      the same order as the columns in `data`; when `None`, types are
      detected from the first `samples` rows (all rows if `samples is None`,
      which loads everything in memory - use with caution).
    - `skip_header`: only used when `fields` is given.
    - `import_fields`: optional subset of field names to keep, in order.
    - `force_types`: only used when `fields is None`; overrides detection.
    Extra `*args`/`**kwargs` are forwarded to `detect_types`.
    """
    # TODO: add auto_detect_types=True parameter
    table_rows = iter(data)
    sample_rows = []

    if fields is None:
        # First row of data is the header
        header = make_header(next(table_rows))

        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
        else:
            sample_rows = list(table_rows)

        fields = detect_types(header, sample_rows, *args, **kwargs)

        if force_types is not None:
            # TODO: optimize field detection (ignore fields on `force_types`)
            for field_name, field_type in force_types.items():
                fields[field_name] = field_type
    else:
        if not isinstance(fields, OrderedDict):
            raise ValueError('`fields` must be an `OrderedDict`')

        if skip_header:
            # `None` default: don't crash with StopIteration on empty data
            next(table_rows, None)

        header = make_header(list(fields.keys()))
        fields = OrderedDict([(field_name, fields[key])
                              for field_name, key in zip(header, fields)])

    if import_fields is not None:
        # TODO: can optimize if import_fields is not None.
        #       Example: do not detect all columns
        import_fields = make_header(import_fields)

        diff = set(import_fields) - set(header)
        if diff:
            field_names = ', '.join('"{}"'.format(field) for field in diff)
            raise ValueError("Invalid field names: {}".format(field_names))

        new_fields = OrderedDict()
        for field_name in import_fields:
            new_fields[field_name] = fields[field_name]
        fields = new_fields

    table = Table(fields=fields, meta=meta)
    # TODO: put this inside Table.__init__
    for row in chain(sample_rows, table_rows):
        table.append(
            {field_name: value
             for field_name, value in zip(header, row)})

    return table
Esempio n. 14
0
 def test_detect_types_unicode(self):
     """Detection on already-decoded (unicode) data must match expectations."""
     decoded = [[cell.decode('utf-8') for cell in row] for row in self.data]
     detected = fields.detect_types(self.fields, decoded)
     self.assertDictEqual(dict(detected), self.expected)
Esempio n. 15
0
 def test_detect_types_unicode(self):
     """Decoding the sample bytes to text must not change detection results."""
     text_rows = [[value.decode('utf-8') for value in row]
                  for row in self.data]
     result = fields.detect_types(self.fields, text_rows)
     self.assertDictEqual(dict(result), self.expected)
Esempio n. 16
0
 def test_detect_types_different_number_of_fields(self):
     """Extra columns must get auto-generated field names (`field_N`)."""
     result = fields.detect_types(["f1", "f2"], [["a", "b", "c"]])
     # `assertEquals` is a deprecated alias; use `assertEqual`
     self.assertEqual(list(result.keys()), ["f1", "f2", "field_2"])
Esempio n. 17
0
def create_table(data,
                 meta=None,
                 fields=None,
                 skip_header=True,
                 import_fields=None,
                 samples=None,
                 force_types=None,
                 *args,
                 **kwargs):
    """Create a rows.Table object based on data rows and some configurations

    - `skip_header` is only used if `fields` is set
    - `samples` is only used if `fields` is `None`. If samples=None, all data
      is filled in memory - use with caution.
    - `force_types` is only used if `fields` is `None`
    - `import_fields` can be used either if `fields` is set or not, the
      resulting fields will seek its order
    - `fields` must always be in the same order as the data
    """

    table_rows = iter(data)
    force_types = force_types or {}
    if import_fields is not None:
        import_fields = make_header(import_fields)

    if fields is None:  # autodetect field types
        # TODO: may add `type_hints` parameter so autodetection can be easier
        #       (plugins may specify some possible field types).
        header = make_header(next(table_rows))

        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
            # Chain samples back so no rows are lost from the output
            table_rows = chain(sample_rows, table_rows)
        else:
            sample_rows = table_rows = list(table_rows)

        # Detect field types using only the desired columns
        detected_fields = detect_types(
            header,
            sample_rows,
            skip_indexes=[
                index for index, field in enumerate(header)
                if field in force_types or field not in (
                    import_fields or header)
            ],
            *args,
            **kwargs)
        # Check if any field was added during detecting process
        new_fields = [
            field_name for field_name in detected_fields.keys()
            if field_name not in header
        ]
        # Finally create the `fields` with both header and new field names,
        # based on detected fields `and force_types`
        fields = OrderedDict([(field_name,
                               detected_fields.get(field_name, TextField))
                              for field_name in header + new_fields])
        fields.update(force_types)

        # Update `header` and `import_fields` based on new `fields`
        header = list(fields.keys())
        if import_fields is None:
            import_fields = header

    else:  # using provided field types
        if not isinstance(fields, OrderedDict):
            raise ValueError("`fields` must be an `OrderedDict`")

        if skip_header:
            # If we're skipping the header probably this row is not trustable
            # (can be data or garbage). The `None` default avoids a
            # StopIteration crash when `data` is empty.
            next(table_rows, None)

        header = make_header(list(fields.keys()))
        if import_fields is None:
            import_fields = header

        fields = OrderedDict([(field_name, fields[key])
                              for field_name, key in zip(header, fields)])

    diff = set(import_fields) - set(header)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))
    # Keep only the imported fields, in `import_fields` order
    fields = OrderedDict([(field_name, fields[field_name])
                          for field_name in import_fields])

    get_row = get_items(*map(header.index, import_fields))
    table = Table(fields=fields, meta=meta)
    table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows)

    return table
Esempio n. 18
0
def create_table(
    data,
    meta=None,
    fields=None,
    skip_header=True,
    import_fields=None,
    samples=None,
    force_types=None,
    *args,
    **kwargs
):
    """Create a rows.Table object based on data rows and some configurations

    - `skip_header` is only used if `fields` is set
    - `samples` is only used if `fields` is `None`. If samples=None, all data
      is filled in memory - use with caution.
    - `force_types` is only used if `fields` is `None`
    - `import_fields` can be used either if `fields` is set or not, the
      resulting fields will seek its order
    - `fields` must always be in the same order as the data
    """

    table_rows = iter(data)
    force_types = force_types or {}
    if import_fields is not None:
        import_fields = make_header(import_fields)

    if fields is None:  # autodetect field types
        # TODO: may add `type_hints` parameter so autodetection can be easier
        #       (plugins may specify some possible field types).
        header = make_header(next(table_rows))

        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
            # Chain samples back so no rows are lost from the output
            table_rows = chain(sample_rows, table_rows)
        else:
            sample_rows = table_rows = list(table_rows)

        # Detect field types using only the desired columns
        detected_fields = detect_types(
            header,
            sample_rows,
            skip_indexes=[
                index
                for index, field in enumerate(header)
                if field in force_types or field not in (import_fields or header)
            ],
            *args,
            **kwargs
        )
        # Check if any field was added during detecting process
        new_fields = [
            field_name
            for field_name in detected_fields.keys()
            if field_name not in header
        ]
        # Finally create the `fields` with both header and new field names,
        # based on detected fields `and force_types`
        fields = OrderedDict(
            [
                (field_name, detected_fields.get(field_name, TextField))
                for field_name in header + new_fields
            ]
        )
        fields.update(force_types)

        # Update `header` and `import_fields` based on new `fields`
        header = list(fields.keys())
        if import_fields is None:
            import_fields = header

    else:  # using provided field types
        if not isinstance(fields, OrderedDict):
            raise ValueError("`fields` must be an `OrderedDict`")

        if skip_header:
            # If we're skipping the header probably this row is not trustable
            # (can be data or garbage). The `None` default avoids a
            # StopIteration crash when `data` is empty.
            next(table_rows, None)

        header = make_header(list(fields.keys()))
        if import_fields is None:
            import_fields = header

        fields = OrderedDict(
            [(field_name, fields[key]) for field_name, key in zip(header, fields)]
        )

    diff = set(import_fields) - set(header)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))
    # Keep only the imported fields, in `import_fields` order
    fields = OrderedDict(
        [(field_name, fields[field_name]) for field_name in import_fields]
    )

    get_row = get_items(*map(header.index, import_fields))
    table = Table(fields=fields, meta=meta)
    table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows)

    return table