def create_table(data, meta=None, force_headers=None, fields=None, skip_header=True, *args, **kwargs):
    """Create a `Table` from an iterable of rows.

    - If `fields` is None, the header is taken from the first row (or from
      `force_headers`) and field types are auto-detected via `detect_types`.
    - If `fields` is given, it must be a `collections.OrderedDict` mapping
      field names to field types, in the same order as the data columns.
    - `skip_header` is only used when `fields` is given.

    Extra `*args`/`**kwargs` are forwarded to `detect_types`.
    """
    # TODO: add auto_detect_types=True parameter
    table_rows = list(data)
    if fields is None:
        if force_headers is None:
            # First row is the header; remaining rows are data.
            header = make_header(table_rows[0])
            table_rows = table_rows[1:]
        else:
            header = force_headers
        fields = detect_types(header, table_rows, *args, **kwargs)
    else:
        if skip_header:
            table_rows = table_rows[1:]
            header = make_header(fields.keys())
            # Raise instead of `assert`: asserts are stripped under `python -O`
            # and `isinstance` is the proper type check.
            if not isinstance(fields, collections.OrderedDict):
                raise ValueError('`fields` must be a `collections.OrderedDict`')
            # Remap the (slugged) header names onto the declared types,
            # keeping column order (a plain dict would lose it pre-3.7).
            fields = collections.OrderedDict(
                (field_name, fields[key])
                for field_name, key in zip(header, fields)
            )
        else:
            header = make_header(fields.keys())

    # TODO: may reuse max_columns from html
    if table_rows:  # `max()` on an empty sequence would raise ValueError
        max_columns = max(len(row) for row in table_rows)
        if len(fields) != max_columns:
            raise ValueError(
                'Number of fields ({}) does not match number of columns ({})'
                .format(len(fields), max_columns)
            )

    # TODO: put this inside Table.__init__
    table = Table(fields=fields, meta=meta)
    for row in table_rows:
        table.append({field_name: value
                      for field_name, value in zip(header, row)})
    return table
def test_detect_types_binary(self):
    """Raw bytes detect as BinaryField; base64-encoded text as TextField."""
    # Raw `bytes` values must be detected as binary
    all_binary = dict.fromkeys(self.expected, fields.BinaryField)
    byte_rows = [[cell.encode('utf-8') for cell in row] for row in self.data]
    detected = fields.detect_types(self.fields, byte_rows)
    self.assertDictEqual(dict(detected), all_binary)

    # Base64-encoded values given as `str`/`unicode` must fall back to text
    all_text = dict.fromkeys(self.expected, fields.TextField)
    b64_rows = [
        [b64encode(cell.encode('utf-8')).decode('ascii') for cell in row]
        for row in self.data
    ]
    detected = fields.detect_types(self.fields, b64_rows)
    self.assertDictEqual(dict(detected), all_text)
def __setitem__(self, key, value):
    """Replace a row (integer key) or set/add a whole column (text key)."""
    key_kind = type(key)

    if key_kind is int:
        # Row assignment: normalize `value` into a proper row first.
        self._rows[key] = self._make_row(value)
        return

    if key_kind is not six.text_type:
        raise ValueError("Unsupported key type: {}".format(type(key).__name__))

    # Column assignment
    from rows import fields
    from rows.plugins import utils

    column = list(value)  # I'm not lazy, sorry
    if len(column) != len(self):
        raise ValueError(
            "Values length ({}) should be the same as "
            "Table length ({})".format(len(column), len(self))
        )

    field_name = utils.slug(key)
    adding_column = field_name not in self.field_names
    # Detect the column type from its values (one value per sample row).
    field_type = fields.detect_types(
        [field_name], [[cell] for cell in column]
    )[field_name]
    self.fields[field_name] = field_type
    self.Row = namedtuple("Row", self.field_names)

    deserialize = field_type.deserialize
    if adding_column:
        for row, cell in zip(self._rows, column):
            row.append(deserialize(cell))
    else:
        position = self.field_names.index(field_name)
        for row, cell in zip(self._rows, column):
            row[position] = deserialize(cell)
def __setitem__(self, key, value):
    """Replace a row (integer key) or set/add a whole column (text key).

    Raises ValueError for unsupported key types or when a column's length
    does not match the table's length.
    """
    # Py2/py3-safe text type: `unicode` does not exist on Python 3 and
    # would raise NameError here (this addresses the old TODO below).
    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3
    key_type = type(key)
    if key_type == int:
        self._rows[key] = self._make_row(value)
    elif key_type == text_type:
        values = list(value)  # I'm not lazy, sorry
        if len(values) != len(self):
            raise ValueError('Values length ({}) should be the same as '
                             'Table length ({})'
                             .format(len(values), len(self)))

        from rows.fields import detect_types
        from rows.utils import slug

        field_name = slug(key)
        is_new_field = field_name not in self.field_names
        # Detect the column type from its values (one value per sample row).
        field_type = detect_types([field_name],
                                  [[value] for value in values])[field_name]
        self.fields[field_name] = field_type
        self.Row = namedtuple('Row', self.field_names)

        if is_new_field:
            for row, value in zip(self._rows, values):
                row.append(field_type.deserialize(value))
        else:
            field_index = self.field_names.index(field_name)
            for row, value in zip(self._rows, values):
                row[field_index] = field_type.deserialize(value)
    else:
        raise ValueError('Unsupported key type: {}'
                         .format(type(key).__name__))
def __setitem__(self, key, value):
    """Integer key: replace a row. Text key: set (or create) a column."""
    if type(key) is int:
        self._rows[key] = self._make_row(value)
    elif type(key) is six.text_type:
        from rows import fields
        from rows.plugins import utils

        new_values = list(value)  # I'm not lazy, sorry
        expected_length = len(self)
        if len(new_values) != expected_length:
            message = ('Values length ({}) should be the same as '
                       'Table length ({})')
            raise ValueError(message.format(len(new_values), expected_length))

        name = utils.slug(key)
        already_present = name in self.field_names
        # One-column sample matrix so detect_types can infer the type.
        detected = fields.detect_types(
            [name], [[item] for item in new_values])
        column_type = detected[name]
        self.fields[name] = column_type
        self.Row = namedtuple('Row', self.field_names)

        if not already_present:
            for row, item in zip(self._rows, new_values):
                row.append(column_type.deserialize(item))
        else:
            index = self.field_names.index(name)
            for row, item in zip(self._rows, new_values):
                row[index] = column_type.deserialize(item)
    else:
        raise ValueError(
            'Unsupported key type: {}'.format(type(key).__name__))
def test_precedence(self):
    """Each sample value must resolve to its most specific field type."""
    column_specs = [
        ('bool', fields.BoolField),
        ('integer', fields.IntegerField),
        ('float', fields.FloatField),
        ('datetime', fields.DatetimeField),
        ('date', fields.DateField),
        ('float', fields.FloatField),
        ('percent', fields.PercentField),
        ('json', fields.JSONField),
        ('email', fields.EmailField),
        ('binary1', fields.BinaryField),
        ('binary2', fields.BinaryField),
        ('text', fields.TextField),
    ]
    sample_row = [
        'false',
        '42',
        '3.14',
        '2016-08-15T05:21:10',
        '2016-08-15',
        '2.71',
        '76.38%',
        '{"key": "value"}',
        '*****@*****.**',
        b'cHl0aG9uIHJ1bGVz',
        b'python rules',
        'Álvaro Justen',
    ]
    names = [name for name, _ in column_specs]
    result = fields.detect_types(names, [sample_row])
    self.assertDictEqual(dict(result), dict(column_specs))
def test_detect_types_utf8(self):
    """detect_types honours an explicit utf-8 `encoding` argument."""
    detected = fields.detect_types(self.fields, self.data, encoding='utf-8')
    # The result must keep field order: exact OrderedDict type, not a subclass
    self.assertEqual(type(detected), collections.OrderedDict)
    self.assertEqual(detected.keys(), self.fields)
    self.assertDictEqual(dict(detected), self.expected)
def test_detect_types_binary(self):
    """bytes values -> BinaryField; base64-encoded str values -> TextField."""
    # first: 20 rows of raw bytes, one cell per declared field
    width = len(self.data[0])
    binary_rows = [[b"some binary data"] * width for _ in range(20)]
    self.assertDictEqual(
        dict(fields.detect_types(self.fields, binary_rows)),
        dict.fromkeys(self.expected, fields.BinaryField),
    )

    # second: base64-encoded text (str) must NOT be detected as binary
    text_rows = [
        [b64encode(cell.encode("utf-8")).decode("ascii") for cell in row]
        for row in self.data
    ]
    self.assertDictEqual(
        dict(fields.detect_types(self.fields, text_rows)),
        dict.fromkeys(self.expected, fields.TextField),
    )
def create_table(data, meta=None, fields=None, skip_header=True,
                 import_fields=None, samples=None, force_types=None,
                 *args, **kwargs):
    """Create a `Table` from an iterable of rows.

    When `fields` is None the header comes from the first row and types are
    detected from up to `samples` rows (all rows if `samples` is None).
    Otherwise `fields` must be an `OrderedDict` in data-column order, and
    `skip_header` controls whether the first row is discarded.
    `import_fields` restricts/reorders the resulting fields.
    """
    # TODO: add auto_detect_types=True parameter
    row_iterator = iter(data)
    sampled = []

    if fields is None:
        header = make_header(next(row_iterator))
        if samples is not None:
            sampled = list(islice(row_iterator, 0, samples))
        else:
            sampled = list(row_iterator)
        fields = detect_types(header, sampled, *args, **kwargs)
        if force_types is not None:
            # TODO: optimize field detection (ignore fields on `force_types`)
            for name, forced_type in force_types.items():
                fields[name] = forced_type
    else:
        if not isinstance(fields, OrderedDict):
            raise ValueError('`fields` must be an `OrderedDict`')
        if skip_header:
            _ = next(row_iterator)  # discard the header row
        header = make_header(list(fields.keys()))
        # Remap slugged header names onto the declared types, keeping order
        fields = OrderedDict(
            (name, fields[key]) for name, key in zip(header, fields))

    if import_fields is not None:
        # TODO: can optimize if import_fields is not None.
        # Example: do not detect all columns
        import_fields = make_header(import_fields)
        unknown = set(import_fields) - set(header)
        if unknown:
            field_names = ', '.join('"{}"'.format(field) for field in unknown)
            raise ValueError("Invalid field names: {}".format(field_names))
        fields = OrderedDict(
            (name, fields[name]) for name in import_fields)

    # TODO: put this inside Table.__init__
    table = Table(fields=fields, meta=meta)
    for row in chain(sampled, row_iterator):
        table.append(dict(zip(header, row)))
    return table
def test_precedence(self):
    """With matching samples, detection picks each declared type."""
    specs = [
        ("bool", fields.BoolField),
        ("integer", fields.IntegerField),
        ("float", fields.FloatField),
        ("datetime", fields.DatetimeField),
        ("date", fields.DateField),
        ("float", fields.FloatField),
        ("percent", fields.PercentField),
        ("json", fields.JSONField),
        ("email", fields.EmailField),
        ("binary1", fields.BinaryField),
        ("binary2", fields.BinaryField),
        ("text", fields.TextField),
    ]
    row = [
        "false",
        "42",
        "3.14",
        "2016-08-15T05:21:10",
        "2016-08-15",
        "2.71",
        "76.38%",
        '{"key": "value"}',
        "*****@*****.**",
        b"cHl0aG9uIHJ1bGVz",
        b"python rules",
        "Álvaro Justen",
    ]
    names = [name for name, _ in specs]
    declared_types = [field_type for _, field_type in specs]
    result = fields.detect_types(names, [row], field_types=declared_types)
    self.assertDictEqual(dict(result), dict(specs))
def test_detect_types(self):
    """Default detection over the fixture data yields the expected mapping."""
    detected = dict(fields.detect_types(self.fields, self.data))
    self.assertDictEqual(detected, self.expected)
def test_detect_types_no_sample(self):
    """Without any sample rows, every field falls back to BinaryField."""
    fallback = dict.fromkeys(self.expected, fields.BinaryField)
    detected = fields.detect_types(self.fields, [])
    self.assertDictEqual(dict(detected), fallback)
def create_table(data, meta=None, fields=None, skip_header=True,
                 import_fields=None, samples=None, force_types=None,
                 *args, **kwargs):
    """Create a `Table` from an iterable of rows.

    - If `fields` is None, the header is the first row and field types are
      detected from up to `samples` rows (all rows when `samples` is None);
      `force_types` entries then override detected types.
    - If `fields` is given, it must be an `OrderedDict` in data-column
      order; `skip_header` controls whether the first row is discarded.
    - `import_fields` restricts (and reorders) the resulting fields.

    Extra `*args`/`**kwargs` are forwarded to `detect_types`.
    Raises ValueError on a non-OrderedDict `fields` or unknown
    `import_fields` names.
    """
    # TODO: add auto_detect_types=True parameter
    table_rows = iter(data)
    sample_rows = []
    if fields is None:
        # Header comes from the first data row (raises StopIteration on
        # empty input — TODO confirm callers guarantee non-empty data).
        header = make_header(next(table_rows))
        if samples is not None:
            sample_rows = list(islice(table_rows, 0, samples))
        else:
            sample_rows = list(table_rows)
        fields = detect_types(header, sample_rows, *args, **kwargs)
        if force_types is not None:
            # TODO: optimize field detection (ignore fields on `force_types`)
            for field_name, field_type in force_types.items():
                fields[field_name] = field_type
    else:
        if not isinstance(fields, OrderedDict):
            raise ValueError('`fields` must be an `OrderedDict`')
        if skip_header:
            next(table_rows)  # discard the (untrusted) header row
        header = make_header(list(fields.keys()))
        # Remap slugged header names onto the declared types, keeping order
        fields = OrderedDict([(field_name, fields[key])
                              for field_name, key in zip(header, fields)])

    if import_fields is not None:
        # TODO: can optimize if import_fields is not None.
        # Example: do not detect all columns
        import_fields = make_header(import_fields)
        diff = set(import_fields) - set(header)
        if diff:
            field_names = ', '.join('"{}"'.format(field) for field in diff)
            raise ValueError("Invalid field names: {}".format(field_names))
        # Keep only the requested fields, in the requested order
        new_fields = OrderedDict()
        for field_name in import_fields:
            new_fields[field_name] = fields[field_name]
        fields = new_fields

    table = Table(fields=fields, meta=meta)  # TODO: put this inside Table.__init__
    # Sampled rows were consumed from the iterator; chain them back in front
    for row in chain(sample_rows, table_rows):
        table.append(
            {field_name: value for field_name, value in zip(header, row)})
    return table
def test_detect_types_unicode(self):
    """Detection over decoded (unicode) values matches the expected types."""
    decoded_rows = [
        [cell.decode('utf-8') for cell in row]
        for row in self.data
    ]
    detected = fields.detect_types(self.fields, decoded_rows)
    self.assertDictEqual(dict(detected), self.expected)
def test_detect_types_different_number_of_fields(self):
    """Extra data columns beyond the declared header get generated names."""
    result = fields.detect_types(["f1", "f2"], [["a", "b", "c"]])
    # `assertEquals` is a deprecated alias (removed in Python 3.12);
    # use the canonical `assertEqual`.
    self.assertEqual(list(result.keys()), ["f1", "f2", "field_2"])
def create_table(data, meta=None, fields=None, skip_header=True,
                 import_fields=None, samples=None, force_types=None,
                 *args, **kwargs):
    """Create a rows.Table object based on data rows and some configurations

    - `skip_header` is only used if `fields` is set
    - `samples` is only used if `fields` is `None`. If samples=None, all data
      is filled in memory - use with caution.
    - `force_types` is only used if `fields` is `None`
    - `import_fields` can be used either if `fields` is set or not, the
      resulting fields will seek its order
    - `fields` must always be in the same order as the data

    Raises ValueError on a non-OrderedDict `fields` or on `import_fields`
    names not present in the header.
    """
    table_rows = iter(data)
    force_types = force_types or {}
    if import_fields is not None:
        import_fields = make_header(import_fields)

    if fields is None:  # autodetect field types
        # TODO: may add `type_hints` parameter so autodetection can be easier
        # (plugins may specify some possible field types).
        header = make_header(next(table_rows))

        if samples is not None:
            # Consume only `samples` rows for detection; chain them back so
            # they are not lost when filling the table below.
            sample_rows = list(islice(table_rows, 0, samples))
            table_rows = chain(sample_rows, table_rows)
        else:
            sample_rows = table_rows = list(table_rows)

        # Detect field types using only the desired columns
        # (skip columns that are forced or not imported)
        detected_fields = detect_types(
            header,
            sample_rows,
            skip_indexes=[
                index
                for index, field in enumerate(header)
                if field in force_types or field not in (import_fields or header)
            ],
            *args, **kwargs)
        # Check if any field was added during detecting process
        new_fields = [
            field_name
            for field_name in detected_fields.keys()
            if field_name not in header
        ]
        # Finally create the `fields` with both header and new field names,
        # based on detected fields `and force_types`
        # (skipped columns default to TextField)
        fields = OrderedDict([(field_name,
                               detected_fields.get(field_name, TextField))
                              for field_name in header + new_fields])
        fields.update(force_types)

        # Update `header` and `import_fields` based on new `fields`
        header = list(fields.keys())
        if import_fields is None:
            import_fields = header

    else:  # using provided field types
        if not isinstance(fields, OrderedDict):
            raise ValueError("`fields` must be an `OrderedDict`")

        if skip_header:
            # If we're skipping the header probably this row is not trustable
            # (can be data or garbage).
            _ = next(table_rows)

        header = make_header(list(fields.keys()))
        if import_fields is None:
            import_fields = header

        # Remap slugged header names onto the declared types, keeping order
        fields = OrderedDict([(field_name, fields[key])
                              for field_name, key in zip(header, fields)])

    diff = set(import_fields) - set(header)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))
    # Keep only the requested fields, in the requested order
    fields = OrderedDict([(field_name, fields[field_name])
                          for field_name in import_fields])

    # `get_row` selects/reorders each row's cells to match `import_fields`
    # (get_items presumably builds an itemgetter-like callable — see its def)
    get_row = get_items(*map(header.index, import_fields))
    table = Table(fields=fields, meta=meta)
    table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows)

    return table
def create_table(
    data,
    meta=None,
    fields=None,
    skip_header=True,
    import_fields=None,
    samples=None,
    force_types=None,
    *args,
    **kwargs
):
    """Create a rows.Table object based on data rows and some configurations

    - `skip_header` is only used if `fields` is set
    - `samples` is only used if `fields` is `None`. If samples=None, all data
      is filled in memory - use with caution.
    - `force_types` is only used if `fields` is `None`
    - `import_fields` can be used either if `fields` is set or not, the
      resulting fields will seek its order
    - `fields` must always be in the same order as the data

    Raises ValueError on a non-OrderedDict `fields` or on `import_fields`
    names not present in the header.
    """
    table_rows = iter(data)
    force_types = force_types or {}
    if import_fields is not None:
        import_fields = make_header(import_fields)

    if fields is None:  # autodetect field types
        # TODO: may add `type_hints` parameter so autodetection can be easier
        # (plugins may specify some possible field types).
        header = make_header(next(table_rows))

        if samples is not None:
            # Consume only `samples` rows for detection; chain them back so
            # they are not lost when filling the table below.
            sample_rows = list(islice(table_rows, 0, samples))
            table_rows = chain(sample_rows, table_rows)
        else:
            sample_rows = table_rows = list(table_rows)

        # Detect field types using only the desired columns
        # (skip columns that are forced or not imported)
        detected_fields = detect_types(
            header,
            sample_rows,
            skip_indexes=[
                index
                for index, field in enumerate(header)
                if field in force_types or field not in (import_fields or header)
            ],
            *args,
            **kwargs
        )
        # Check if any field was added during detecting process
        new_fields = [
            field_name
            for field_name in detected_fields.keys()
            if field_name not in header
        ]
        # Finally create the `fields` with both header and new field names,
        # based on detected fields `and force_types`
        # (skipped columns default to TextField)
        fields = OrderedDict(
            [
                (field_name, detected_fields.get(field_name, TextField))
                for field_name in header + new_fields
            ]
        )
        fields.update(force_types)

        # Update `header` and `import_fields` based on new `fields`
        header = list(fields.keys())
        if import_fields is None:
            import_fields = header

    else:  # using provided field types
        if not isinstance(fields, OrderedDict):
            raise ValueError("`fields` must be an `OrderedDict`")

        if skip_header:
            # If we're skipping the header probably this row is not trustable
            # (can be data or garbage).
            _ = next(table_rows)

        header = make_header(list(fields.keys()))
        if import_fields is None:
            import_fields = header

        # Remap slugged header names onto the declared types, keeping order
        fields = OrderedDict(
            [(field_name, fields[key]) for field_name, key in zip(header, fields)]
        )

    diff = set(import_fields) - set(header)
    if diff:
        field_names = ", ".join('"{}"'.format(field) for field in diff)
        raise ValueError("Invalid field names: {}".format(field_names))
    # Keep only the requested fields, in the requested order
    fields = OrderedDict(
        [(field_name, fields[field_name]) for field_name in import_fields]
    )

    # `get_row` selects/reorders each row's cells to match `import_fields`
    # (get_items presumably builds an itemgetter-like callable — see its def)
    get_row = get_items(*map(header.index, import_fields))
    table = Table(fields=fields, meta=meta)
    table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows)

    return table