def test_limit(self):
    """The row-sampling `limit` bounds how much evidence inference sees."""
    sample = [('1.7',), ('foo',), ('',)]

    # Only the first row is inspected, so '1.7' wins as a Number.
    inferred = TypeTester(limit=1).run(sample, ['one'])
    self.assertIsInstance(inferred[0], Number)

    # A second row brings 'foo' into view, forcing a fallback to Text.
    inferred = TypeTester(limit=2).run(sample, ['one'])
    self.assertIsInstance(inferred[0], Text)
def test_types_force_text(self):
    """With Text as the only candidate type, numeric strings stay Text."""
    sample = [('1.7',), ('200000000',), ('',)]
    inferred = TypeTester(types=[Text()]).run(sample, ['one'])
    self.assertIsInstance(inferred[0], Text)
def test_types_no_boolean(self):
    """Boolean-looking strings infer as Text when Boolean is not a candidate."""
    sample = [('True',), ('False',), ('False',)]
    inferred = TypeTester(types=[Number(), Text()]).run(sample, ['one'])
    self.assertIsInstance(inferred[0], Text)
def test_limit(self):
    """Inference honors the row-sampling limit."""
    rows = [('1.7',), ('foo',), ('',)]

    tester = TypeTester(limit=1)
    # One row sampled: numeric-looking data infers as Number.
    self.assertIsInstance(tester.run(rows, ['one'])[0], Number)

    tester = TypeTester(limit=2)
    # Two rows sampled: the non-numeric 'foo' demotes the column to Text.
    self.assertIsInstance(tester.run(rows, ['one'])[0], Text)
def test_force_type(self):
    """A forced column type bypasses inference entirely."""
    sample = [('1.7',), ('200000000',), ('',)]
    inferred = TypeTester(force={'one': Text()}).run(sample, ['one'])
    self.assertIsInstance(inferred[0], Text)
def test_types_number_locale(self):
    """A locale-aware Number candidate parses German-formatted numerals."""
    sample = [('1,7',), ('200.000.000',), ('',)]
    inferred = TypeTester(types=[Number(locale='de_DE.UTF-8'), Text()]).run(sample, ['one'])
    self.assertIsInstance(inferred[0], Number)
    # The locale identifier is normalized to its short form.
    self.assertEqual(str(inferred[0].locale), 'de_DE')
def test_types_no_boolean(self):
    """Excluding Boolean from the candidates makes 'True'/'False' Text."""
    rows = [('True',), ('False',), ('False',)]
    tester = TypeTester(types=[Number(), Text()])
    result = tester.run(rows, ['one'])
    self.assertIsInstance(result[0], Text)
def test_types_force_text(self):
    """A single-candidate type list forces that type for every column."""
    rows = [('1.7',), ('200000000',), ('',)]
    tester = TypeTester(types=[Text()])
    result = tester.run(rows, ['one'])
    self.assertIsInstance(result[0], Text)
def test_types_number_locale(self):
    """German-locale values such as '1,7' are recognized as Numbers."""
    rows = [('1,7',), ('200.000.000',), ('',)]
    tester = TypeTester(types=[Number(locale='de_DE'), Text()])
    result = tester.run(rows, ['one'])
    self.assertIsInstance(result[0], Number)
    self.assertEqual(result[0].locale, 'de_DE')
def test_force_type(self):
    """Forced types override inference for the named column."""
    rows = [('1.7',), ('200000000',), ('',)]
    tester = TypeTester(force={'one': Text()})
    result = tester.run(rows, ['one'])
    self.assertIsInstance(result[0], Text)
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table with columns converted into rows of (property, value)
    pairs, one output row per key/property combination.

    This is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names preserved as-is in the
        normalized table (identifiers and metadata).
    :param properties:
        A column name or a sequence of column names to convert into
        property rows.
    :param property_column:
        The name of the new column holding the property names.
    :param value_column:
        The name of the new column holding the property values.
    :param column_types:
        A sequence of two column types for the property and value columns,
        in that order, or an instance of :class:`.TypeTester`. Defaults to
        a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if not utils.issequence(key):
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    new_rows = []
    row_names = []

    for row in self.rows:
        key_values = tuple(row[name] for name in key)
        prefix = list(key_values)

        # A single-column key names rows by the bare value, not a 1-tuple.
        row_names.append(key_values[0] if len(key_values) == 1 else key_values)

        for prop in properties:
            new_rows.append(Row(tuple(prefix + [prop, row[prop]]), new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = column_types if isinstance(column_types, TypeTester) else TypeTester()
        # Key columns keep their original types; fold them into the forced set
        # without clobbering any types the caller already forced.
        forced = dict(zip(key, key_column_types))
        forced.update(tester._force)
        tester._force = forced
        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table in which the given property columns are converted
    into rows: each output row pairs one key with one property name and its
    value.

    This is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names kept unchanged in the
        normalized table (typically identifiers and metadata).
    :param properties:
        A column name or a sequence of column names to turn into
        property/value rows.
    :param property_column:
        Name for the column holding the property names.
    :param value_column:
        Name for the column holding the property values.
    :param column_types:
        A sequence of two column types (property, value) or an instance of
        :class:`.TypeTester`. Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if not utils.issequence(key):
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    new_rows = []
    row_names = []

    for row in self._rows:
        key_tuple = tuple(row[name] for name in key)
        prefix = list(key_tuple)

        # Scalar keys name rows by their single value rather than a tuple.
        if len(key_tuple) == 1:
            row_names.append(key_tuple[0])
        else:
            row_names.append(key_tuple)

        for prop in properties:
            new_rows.append(Row(prefix + [prop, row[prop]], new_column_names))

    key_column_types = [
        self._column_types[self._column_names.index(name)]
        for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Preserve the key columns' original types while honoring any
        # explicitly forced types on the tester.
        merged_force = dict(zip(key, key_column_types))
        merged_force.update(tester._force)
        tester._force = merged_force
        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a new table.

    :param rows:
        A sequence of sequences of cell values.
    :param column_names:
        A sequence of column name strings (entries may be None, in which
        case letter names are generated). Duplicates are disambiguated
        with a ``_2``, ``_3``, ... suffix.
    :param column_types:
        A sequence of :class:`.DataType` instances, a dict mapping column
        names to types, an instance of :class:`.TypeTester`, or None to
        infer types.
    :param row_names:
        A column name, a function of a row, or a sequence used to name rows.
    :param _is_fork:
        Internal flag: when True, `rows` are assumed to already be cast
        :class:`.Row` instances and casting is skipped.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?'
        )

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                new_column_name = utils.letter_name(i)
                warnings.warn(
                    'Column name not specified. "%s" will be used as name.' % new_column_name,
                    RuntimeWarning)
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError('Column names must be strings or None.')

            final_column_name = new_column_name
            duplicates = 0

            # Disambiguate repeated names as "name_2", "name_3", ...
            while final_column_name in final_column_names:
                final_column_name = new_column_name + '_' + str(duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.'
            % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        # FIX: was `[]` — use a tuple so _column_names has a consistent
        # type across all three branches.
        self._column_names = tuple()

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for v in six.itervalues(column_types):
            if not isinstance(v, DataType):
                raise ValueError('Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len_row < len_column_names:
                # FIX: reuse the hoisted length instead of re-reading the
                # `column_names` property on every short row.
                row = chain(row, [None] * (len_column_names - len_row))

            # Use `j` for the cell index so it can't be confused with the
            # outer row index `i`.
            new_rows.append(
                Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)), self._column_names))
    else:
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            for row in new_rows:
                computed_row_names.append(row[row_names])
        elif hasattr(row_names, '__call__'):
            for row in new_rows:
                computed_row_names.append(row_names(row))
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
class TestTypeTester(unittest.TestCase):
    """Exercise type inference for each of agate's supported data types."""

    def setUp(self):
        self.tester = TypeTester()

    def _infer(self, rows):
        # Run the default tester over a single-column sample and return
        # the inferred type for that column.
        return self.tester.run(rows, ['one'])[0]

    def test_text_type(self):
        self.assertIsInstance(self._infer([('a',), ('b',), ('',)]), Text)

    def test_number_type(self):
        self.assertIsInstance(self._infer([('1.7',), ('200000000',), ('',)]), Number)

    def test_number_percent(self):
        self.assertIsInstance(self._infer([('1.7%',), ('200000000%',), ('',)]), Number)

    def test_number_currency(self):
        self.assertIsInstance(self._infer([('$1.7',), ('$200000000',), ('',)]), Number)

    def test_number_currency_locale(self):
        self.assertIsInstance(self._infer([(u'£1.7',), (u'£200000000',), ('',)]), Number)

    def test_boolean_type(self):
        self.assertIsInstance(self._infer([('True',), ('FALSE',), ('',)]), Boolean)

    def test_date_type(self):
        self.assertIsInstance(
            self._infer([('5/7/1984',), ('2/28/1997',), ('3/19/2020',), ('',)]), Date)

    def test_date_type_iso_format(self):
        self.assertIsInstance(
            self._infer([('1984-05-07',), ('1997-02-28',), ('2020-03-19',), ('',)]), Date)

    def test_date_time_type(self):
        self.assertIsInstance(
            self._infer([('5/7/84 3:44:12',), ('2/28/1997 3:12 AM',), ('3/19/20 4:40 PM',), ('',)]),
            DateTime)

    def test_date_time_type_isoformat(self):
        self.assertIsInstance(
            self._infer([('1984-07-05T03:44:12',), ('1997-02-28T03:12:00',), ('2020-03-19T04:40:00',), ('',)]),
            DateTime)

    def test_time_delta_type(self):
        self.assertIsInstance(self._infer([('1:42',), ('1w 27h',), ('',)]), TimeDelta)

    def test_force_type(self):
        # A forced type bypasses inference for the named column.
        tester = TypeTester(force={'one': Text()})
        inferred = tester.run([('1.7',), ('200000000',), ('',)], ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_limit(self):
        rows = [('1.7',), ('foo',), ('',)]
        # One row sampled: Number. Two rows sampled: 'foo' forces Text.
        self.assertIsInstance(TypeTester(limit=1).run(rows, ['one'])[0], Number)
        self.assertIsInstance(TypeTester(limit=2).run(rows, ['one'])[0], Text)

    def test_types_force_text(self):
        inferred = TypeTester(types=[Text()]).run([('1.7',), ('200000000',), ('',)], ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_types_no_boolean(self):
        inferred = TypeTester(types=[Number(), Text()]).run(
            [('True',), ('False',), ('False',)], ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_types_number_locale(self):
        inferred = TypeTester(types=[Number(locale='de_DE'), Text()]).run(
            [('1,7',), ('200.000.000',), ('',)], ['one'])
        self.assertIsInstance(inferred[0], Number)
        self.assertEqual(inferred[0].locale, 'de_DE')
class TestTypeTester(unittest.TestCase):
    """Verify TypeTester's inference across all supported data types."""

    def setUp(self):
        self.tester = TypeTester()

    def test_empty(self):
        data = [(None,), (None,), (None,)]
        inferred = self.tester.run(data, ['one'])
        # This behavior is not necessarily desirable.
        # See https://github.com/wireservice/agate/issues/371
        self.assertIsInstance(inferred[0], Boolean)

    def test_text_type(self):
        data = [('a',), ('b',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_number_type(self):
        data = [('1.7',), ('200000000',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Number)

    def test_number_percent(self):
        data = [('1.7%',), ('200000000%',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Number)

    def test_number_currency(self):
        data = [('$1.7',), ('$200000000',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Number)

    def test_number_currency_locale(self):
        data = [(u'£1.7',), (u'£200000000',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Number)

    def test_boolean_type(self):
        data = [('True',), ('FALSE',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Boolean)

    def test_date_type(self):
        data = [('5/7/1984',), ('2/28/1997',), ('3/19/2020',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Date)

    def test_date_type_iso_format(self):
        data = [('1984-05-07',), ('1997-02-28',), ('2020-03-19',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], Date)

    def test_date_time_type(self):
        data = [('5/7/84 3:44:12',), ('2/28/1997 3:12 AM',), ('3/19/20 4:40 PM',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], DateTime)

    def test_date_time_type_isoformat(self):
        data = [('1984-07-05T03:44:12',), ('1997-02-28T03:12:00',), ('2020-03-19T04:40:00',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], DateTime)

    def test_time_delta_type(self):
        data = [('1:42',), ('1w 27h',), ('',)]
        inferred = self.tester.run(data, ['one'])
        self.assertIsInstance(inferred[0], TimeDelta)

    def test_force_type(self):
        data = [('1.7',), ('200000000',), ('',)]
        forced = TypeTester(force={'one': Text()})
        inferred = forced.run(data, ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_limit(self):
        data = [('1.7',), ('foo',), ('',)]

        # Sampling one row sees only the numeric value.
        limited = TypeTester(limit=1)
        inferred = limited.run(data, ['one'])
        self.assertIsInstance(inferred[0], Number)

        # Sampling two rows sees 'foo' and falls back to Text.
        limited = TypeTester(limit=2)
        inferred = limited.run(data, ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_types_force_text(self):
        data = [('1.7',), ('200000000',), ('',)]
        restricted = TypeTester(types=[Text()])
        inferred = restricted.run(data, ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_types_no_boolean(self):
        data = [('True',), ('False',), ('False',)]
        restricted = TypeTester(types=[Number(), Text()])
        inferred = restricted.run(data, ['one'])
        self.assertIsInstance(inferred[0], Text)

    def test_types_number_locale(self):
        data = [('1,7',), ('200.000.000',), ('',)]
        restricted = TypeTester(types=[Number(locale='de_DE.UTF-8'), Text()])
        inferred = restricted.run(data, ['one'])
        self.assertIsInstance(inferred[0], Number)
        # The locale identifier is normalized to its short form.
        self.assertEqual(str(inferred[0].locale), 'de_DE')
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a new table from a sequence of rows.

    Column names are deduplicated (or generated when missing), column types
    are validated or inferred, cells are cast, and row names are computed.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. '
            'Did you want agate.Table.from_csv?')

    # Resolve column names: deduplicate explicit ones, or generate
    # letter names from the width of the first row.
    if column_names:
        self._column_names = utils.deduplicate(column_names, column_names=True)
    elif rows:
        self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.'
            % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        self._column_names = tuple()

    column_count = len(self._column_names)

    # Resolve column types: None -> infer; dict -> forced types;
    # otherwise a sequence of DataType instances.
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for forced_type in column_types.values():
            if not isinstance(forced_type, DataType):
                raise ValueError('Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for candidate in column_types:
            if not isinstance(candidate, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if column_count != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if not _is_fork:
        # Cast every cell, padding short rows with None and annotating
        # cast failures with their row/column location.
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for row_index, row in enumerate(rows):
            width = len(row)

            if width > column_count:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (row_index, width, column_count))
            elif len(row) < column_count:
                row = chain(row, [None] * (column_count - width))

            cast_values = []

            for col_index, value in enumerate(row):
                try:
                    cast_values.append(cast_funcs[col_index](value))
                except CastError as e:
                    raise CastError(
                        str(e) + ' Error at row %s column %s.'
                        % (row_index, self._column_names[col_index]))

            new_rows.append(Row(cast_values, self._column_names))
    else:
        new_rows = rows

    # Compute row names from a column, a function, or a literal sequence.
    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            for row in new_rows:
                computed_row_names.append(row[row_names])
        elif hasattr(row_names, '__call__'):
            for row in new_rows:
                computed_row_names.append(row_names(row))
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        for candidate_name in computed_row_names:
            if type(candidate_name) is int:
                raise ValueError(
                    'Row names cannot be of type int. Use Decimal for numbered row names.')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build column accessors over the cast rows.
    new_columns = []

    for col_index in range(column_count):
        new_columns.append(Column(
            col_index,
            self._column_names[col_index],
            self._column_types[col_index],
            self._rows,
            row_names=self._row_names))

    self._columns = MappedSequence(new_columns, self._column_names)
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Create a new table with row values converted into columns: each unique
    value in `property_column` becomes a column, populated from
    `value_column`.

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the tables unique identifiers and any metadata about them.
        Or, :code:`None` if there are no key columns.
    :param property_column:
        The column whose unique values should become column names in the
        new table. (Docstring fix: this parameter was previously
        documented under the nonexistent name ``field_column``.)
    :param value_column:
        The column whose values should populate the new columns.
    :param default_value:
        Value to be used for missing values in the pivot table. If not
        specified :code:`Decimal(0)` will be used for :class:`.Number`
        data and :code:`None` will be used for all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]

    field_names = []
    row_data = OrderedDict()

    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    # FIX: compare the sentinel by identity. `==` could invoke an arbitrary
    # __eq__ on a caller-supplied default value; `utils.default` is a
    # sentinel object, so `is` is the correct test.
    if default_value is utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # Single-column keys name rows by the bare value, not a 1-tuple.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            row.append(v.get(f, default_value))

        new_rows.append(Row(row, new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Key columns keep their original types; merge into the forced set.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update
        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a new table.

    :param rows:
        A sequence of sequences of cell values.
    :param column_names:
        A sequence of column name strings; None entries get generated
        letter names and duplicates are suffixed ``_2``, ``_3``, ...
    :param column_types:
        A sequence of :class:`.DataType` instances, a dict of forced types,
        a :class:`.TypeTester`, or None to infer.
    :param row_names:
        A column name, a function of a row, or a sequence of row names.
    :param _is_fork:
        Internal flag: when True, `rows` are already cast Rows and the
        casting pass is skipped.
    """
    if isinstance(rows, six.string_types):
        raise ValueError('When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                new_column_name = utils.letter_name(i)
                warnings.warn('Column name not specified. "%s" will be used as name.' % new_column_name, RuntimeWarning)
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError('Column names must be strings or None.')

            final_column_name = new_column_name
            duplicates = 0

            # Disambiguate repeated names as "name_2", "name_3", ...
            while final_column_name in final_column_names:
                final_column_name = new_column_name + '_' + str(duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn('Column names not specified. "%s" will be used as names.'
                      % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        # FIX: was `[]` — use a tuple so _column_names has a consistent
        # type regardless of which branch assigned it.
        self._column_names = tuple()

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for v in six.itervalues(column_types):
            if not isinstance(v, DataType):
                raise ValueError('Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError('Row %i has %i values, but Table only has %i columns.'
                                 % (i, len_row, len_column_names))
            elif len_row < len_column_names:
                # FIX: reuse the hoisted length instead of re-reading the
                # `column_names` property for every short row.
                row = chain(row, [None] * (len_column_names - len_row))

            # `j` is the cell index — distinct from the row index `i`.
            new_rows.append(Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)), self._column_names))
    else:
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            for row in new_rows:
                computed_row_names.append(row[row_names])
        elif hasattr(row_names, '__call__'):
            for row in new_rows:
                computed_row_names.append(row_names(row))
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Create a new table with row values converted into columns.

    Each unique value in `property_column` becomes a column in the new
    table, filled from `value_column`. If one or more keys are specified
    then the resulting table will automatically have :code:`row_names`
    set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table, or :code:`None`
        if there are no key columns.
    :param property_column:
        The column whose unique values should become column names in the
        new table. (Docstring fix: previously documented under the
        nonexistent name ``field_column``.)
    :param value_column:
        The column whose values should populate the new columns.
    :param default_value:
        Value to be used for missing values in the pivot table. If not
        specified :code:`Decimal(0)` will be used for :class:`.Number`
        data and :code:`None` for all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]

    field_names = []
    row_data = OrderedDict()

    # Group values by key, remembering property names in first-seen order.
    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    # FIX: `utils.default` is a sentinel; test it by identity rather than
    # `==`, which could trigger arbitrary __eq__ on caller-supplied values.
    if default_value is utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # Single-column keys name rows by the bare value, not a 1-tuple.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            row.append(v.get(f, default_value))

        new_rows.append(Row(row, new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Key columns keep their original types; merge into the forced set.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update
        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Build a table: resolve names, resolve/infer types, cast cells,
    compute row names, and materialize row and column sequences.
    """
    if isinstance(rows, six.string_types):
        raise ValueError('When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

    # Column names: deduplicate the given ones or synthesize letter names.
    if column_names:
        self._column_names = utils.deduplicate(column_names, column_names=True)
    elif rows:
        self._column_names = tuple(utils.letter_name(n) for n in range(len(rows[0])))
        warnings.warn('Column names not specified. "%s" will be used as names.'
                      % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        self._column_names = tuple()

    n_columns = len(self._column_names)

    # Column types: dict means forced types, None means full inference.
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for forced in column_types.values():
            if not isinstance(forced, DataType):
                raise ValueError('Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for declared in column_types:
            if not isinstance(declared, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if n_columns != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if _is_fork:
        # Forked tables supply already-cast Row instances.
        new_rows = rows
    else:
        new_rows = []
        casters = [t.cast for t in self._column_types]

        for r, raw_row in enumerate(rows):
            width = len(raw_row)

            if width > n_columns:
                raise ValueError('Row %i has %i values, but Table only has %i columns.'
                                 % (r, width, n_columns))
            elif len(raw_row) < n_columns:
                # Pad short rows with None before casting.
                raw_row = chain(raw_row, [None] * (n_columns - width))

            cast_cells = []

            for c, cell in enumerate(raw_row):
                try:
                    cast_cells.append(casters[c](cell))
                except CastError as e:
                    # Annotate cast failures with their location.
                    raise CastError(str(e) + ' Error at row %s column %s.'
                                    % (r, self._column_names[c]))

            new_rows.append(Row(cast_cells, self._column_names))

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # Name rows by the values of one column.
            for current in new_rows:
                computed_row_names.append(current[row_names])
        elif hasattr(row_names, '__call__'):
            # Name rows by applying a function to each row.
            for current in new_rows:
                computed_row_names.append(row_names(current))
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        for candidate in computed_row_names:
            if type(candidate) is int:
                raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Column accessors over the cast rows.
    built_columns = []

    for c in range(n_columns):
        built_columns.append(Column(
            c,
            self._column_names[c],
            self._column_types[c],
            self._rows,
            row_names=self._row_names))

    self._columns = MappedSequence(built_columns, self._column_names)