Example #1
0
    def test_limit(self):
        """
        Check that the ``limit`` option changes the inferred type.

        For the same three rows the test expects :class:`Number` with
        ``limit=1`` and :class:`Text` with ``limit=2``.
        """
        data = [('1.7', ), ('foo', ), ('', )]

        # Presumably only the numeric-looking first value is inspected here.
        with_limit_one = TypeTester(limit=1).run(data, ['one'])
        self.assertIsInstance(with_limit_one[0], Number)

        # Presumably 'foo' is inspected as well, which rules out a number.
        with_limit_two = TypeTester(limit=2).run(data, ['one'])
        self.assertIsInstance(with_limit_two[0], Text)
Example #2
0
    def test_types_force_text(self):
        """
        A tester built with ``types=[Text()]`` is expected to report
        :class:`Text` even though the values look numeric.
        """
        numeric_looking = [('1.7', ), ('200000000', ), ('', )]
        text_only_tester = TypeTester(types=[Text()])

        column_types = text_only_tester.run(numeric_looking, ['one'])

        self.assertIsInstance(column_types[0], Text)
Example #3
0
    def test_types_no_boolean(self):
        """
        With only ``Number`` and ``Text`` offered as candidate types, the
        values 'True'/'False' are expected to be inferred as :class:`Text`.
        """
        boolean_looking = [('True', ), ('False', ), ('False', )]
        candidate_types = [Number(), Text()]

        column_types = TypeTester(types=candidate_types).run(
            boolean_looking, ['one'])

        self.assertIsInstance(column_types[0], Text)
Example #4
0
    def test_limit(self):
        """
        Run the same rows through testers with different ``limit`` values.

        ``limit=1`` is expected to yield :class:`Number` and ``limit=2``
        :class:`Text`.
        """
        rows = [('1.7',), ('foo',), ('',)]

        # (limit, expected type) pairs, checked in this order.
        expectations = ((1, Number), (2, Text))

        for limit, expected_type in expectations:
            inferred = TypeTester(limit=limit).run(rows, ['one'])

            self.assertIsInstance(inferred[0], expected_type)
Example #5
0
    def test_force_type(self):
        """
        A ``force`` mapping for column ``'one'`` is expected to win over
        inference: numeric-looking values still come back as :class:`Text`.
        """
        numeric_looking = [('1.7', ), ('200000000', ), ('', )]
        forced = {'one': Text()}

        column_types = TypeTester(force=forced).run(numeric_looking, ['one'])

        self.assertIsInstance(column_types[0], Text)
Example #6
0
    def test_types_number_locale(self):
        """
        German-formatted numbers are expected to be inferred as
        :class:`Number` when a ``Number`` with a German locale is offered,
        and the returned type's locale is expected to stringify to 'de_DE'.
        """
        german_values = [('1,7', ), ('200.000.000', ), ('', )]
        candidate_types = [Number(locale='de_DE.UTF-8'), Text()]

        column_type = TypeTester(types=candidate_types).run(
            german_values, ['one'])[0]

        self.assertIsInstance(column_type, Number)
        self.assertEqual(str(column_type.locale), 'de_DE')
Example #7
0
    def test_types_no_boolean(self):
        """
        'True'/'False' values are expected to be inferred as :class:`Text`
        when the tester is only given ``Number`` and ``Text`` to choose from.
        """
        rows = [(flag,) for flag in ('True', 'False', 'False')]

        without_boolean = TypeTester(types=[Number(), Text()])
        first_type = without_boolean.run(rows, ['one'])[0]

        self.assertIsInstance(first_type, Text)
Example #8
0
    def test_types_force_text(self):
        """
        When ``Text`` is the only candidate type, numeric-looking values are
        expected to be inferred as :class:`Text`.
        """
        rows = [(value,) for value in ('1.7', '200000000', '')]

        text_only = TypeTester(types=[Text()])
        first_type = text_only.run(rows, ['one'])[0]

        self.assertIsInstance(first_type, Text)
Example #9
0
    def test_types_number_locale(self):
        """
        With a ``Number(locale='de_DE')`` candidate, German-formatted values
        are expected to be inferred as :class:`Number`, and the returned
        type's ``locale`` is expected to equal 'de_DE'.
        """
        rows = [(value,) for value in ('1,7', '200.000.000', '')]

        german_tester = TypeTester(types=[Number(locale='de_DE'), Text()])
        first_type = german_tester.run(rows, ['one'])[0]

        self.assertIsInstance(first_type, Number)
        self.assertEqual(first_type.locale, 'de_DE')
Example #10
0
    def test_force_type(self):
        """
        Forcing column ``'one'`` to ``Text`` is expected to override what
        would otherwise be inferred from numeric-looking values.
        """
        rows = [(value,) for value in ('1.7', '200000000', '')]

        forcing_tester = TypeTester(force={'one': Text()})
        first_type = forcing_tester.run(rows, ['one'])[0]

        self.assertIsInstance(first_type, Text)
Example #11
0
def normalize(self,
              key,
              properties,
              property_column='property',
              value_column='value',
              column_types=None):
    """
    Create a new table with columns converted into row values.

    For example:

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    can be normalized on columns 'gender', 'race' and 'age':

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    This is the opposite of :meth:`.Table.denormalize`.

    The key values of each source row are also used as the row names of the
    new table (the bare value for a single key column, a tuple otherwise).

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`. Key columns keep their current types
        unless the tester already forces a type for them. A tester passed
        in by the caller is handed back unmodified.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    # Always work on a private list: a tuple of key names could not be
    # concatenated with the list of new column names below.
    key = list(key) if utils.issequence(key) else [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    new_rows = []
    row_names = []

    for row in self.rows:
        k = tuple(row[n] for n in key)
        left_row = list(k)

        # One key column: name rows by its bare value; otherwise by the tuple.
        row_names.append(k[0] if len(k) == 1 else k)

        # Every source row yields one output row per property column.
        for f in properties:
            new_rows.append(
                Row(tuple(left_row + [f, row[f]]), new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types

        # Key columns default to their existing types; anything the tester
        # already forces takes precedence.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)

        # Install the merged mapping for this run only, so that a tester
        # supplied by the caller is not left permanently altered.
        original_force = tester._force
        tester._force = force_update

        try:
            new_column_types = tester.run(new_rows, new_column_names)
        finally:
            tester._force = original_force
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows,
                 new_column_names,
                 new_column_types,
                 row_names=row_names)
Example #12
0
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table with columns converted into row values.

    For example:

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    can be normalized on columns 'gender', 'race' and 'age':

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    This is the opposite of :meth:`.Table.denormalize`.

    The key values of each source row double as the row names of the new
    table (the bare value for a single key column, a tuple otherwise).

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`. Key columns keep their current types
        unless the tester already forces a type for them. A tester passed
        in by the caller is handed back unmodified.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    # Copy sequences into a list so that tuples (and other non-list
    # sequences) can be concatenated with the new column names below.
    if utils.issequence(key):
        key = list(key)
    else:
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    new_rows = []
    row_names = []

    for row in self._rows:
        k = tuple(row[n] for n in key)
        left_row = list(k)

        # A single key column names the row by its bare value.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        # One output row per (source row, property column) pair.
        for f in properties:
            new_rows.append(Row((left_row + [f, row[f]]), new_column_names))

    key_column_types = [self._column_types[self._column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types

        # Existing key column types are the default; types the tester already
        # forces win over them.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)

        # Swap the merged mapping in for this run only and restore the
        # original afterwards, so a caller-provided tester is not mutated.
        original_force = tester._force
        tester._force = force_update

        try:
            new_column_types = tester.run(new_rows, new_column_names)
        finally:
            tester._force = original_force
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
Example #13
0
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        """
        Build the table's column names, column types, rows and columns.

        :param rows:
            A sequence of row value sequences. A string is rejected.
        :param column_names:
            Optional sequence of column names. :code:`None` entries are
            replaced by a name from :code:`utils.letter_name` and duplicate
            names get a numeric suffix (:code:`_2`, :code:`_3`, ...); both
            cases emit a warning. If omitted, names are generated from the
            width of the first row.
        :param column_types:
            :code:`None` (a default :class:`TypeTester` is used), a dict of
            :class:`DataType` instances (used as the ``force`` argument of a
            :class:`TypeTester`), a :class:`TypeTester`, or a sequence of
            :class:`DataType` instances.
        :param row_names:
            A column name, a function called with each row, or a sequence
            of names. If falsy, the table has no row names.
        :param _is_fork:
            When true, ``rows`` is used as-is, without length checks or
            casting.
        :raises ValueError:
            For a string ``rows`` argument, invalid column names or types,
            mismatched name/type counts, over-long rows, or an unsupported
            ``row_names`` value.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?'
            )

        # Validate column names
        if column_names:
            final_column_names = []

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    new_column_name = utils.letter_name(i)
                    warnings.warn(
                        'Column name not specified. "%s" will be used as name.'
                        % new_column_name, RuntimeWarning)
                elif isinstance(column_name, six.string_types):
                    new_column_name = column_name
                else:
                    raise ValueError('Column names must be strings or None.')

                # Append _2, _3, ... until the name no longer collides with
                # a name that has already been accepted.
                final_column_name = new_column_name
                duplicates = 0
                while final_column_name in final_column_names:
                    final_column_name = new_column_name + '_' + str(
                        duplicates + 2)
                    duplicates += 1

                if duplicates > 0:
                    warn_duplicate_column(new_column_name, final_column_name)

                final_column_names.append(final_column_name)

            self._column_names = tuple(final_column_names)
        elif rows:
            self._column_names = tuple(
                utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' %
                str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            # Keep the attribute a tuple in every branch so that its type
            # does not depend on how the table was constructed.
            self._column_names = tuple()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()

        elif isinstance(column_types, dict):
            for v in six.itervalues(column_types):
                if not isinstance(v, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')
            column_types = TypeTester(force=column_types)

        elif not isinstance(column_types, TypeTester):
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError(
                'column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (i, len_row, len_column_names))
                elif len_row < len_column_names:
                    # Pad short rows with None up to the table width.
                    row = chain(row, [None] * (len_column_names - len_row))

                # `j` is the column index; kept distinct from the row index.
                new_rows.append(
                    Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)),
                        self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                # A string is treated as the name of the column to read.
                for row in new_rows:
                    name = row[row_names]
                    computed_row_names.append(name)
            elif hasattr(row_names, '__call__'):
                for row in new_rows:
                    name = row_names(row)
                    computed_row_names.append(name)
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(
                zip(self._column_names, self._column_types)):
            column = Column(i,
                            name,
                            data_type,
                            self._rows,
                            row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)
Example #14
0
class TestTypeTester(unittest.TestCase):
    """
    Exercise :class:`TypeTester` inference on a single column named 'one'.
    """

    def setUp(self):
        # Default tester for every test that needs no special options.
        self.tester = TypeTester()

    def _infer(self, values, tester=None):
        """
        Infer the type of a one-column table holding ``values``.

        :param values:
            Cell values, one per row.
        :param tester:
            The tester to run; ``self.tester`` when omitted.
        :returns:
            The first (and only) entry of the tester's result.
        """
        if tester is None:
            tester = self.tester

        rows = [(value,) for value in values]

        return tester.run(rows, ['one'])[0]

    def test_text_type(self):
        self.assertIsInstance(self._infer(['a', 'b', '']), Text)

    def test_number_type(self):
        self.assertIsInstance(self._infer(['1.7', '200000000', '']), Number)

    def test_number_percent(self):
        self.assertIsInstance(self._infer(['1.7%', '200000000%', '']), Number)

    def test_number_currency(self):
        self.assertIsInstance(self._infer(['$1.7', '$200000000', '']), Number)

    def test_number_currency_locale(self):
        self.assertIsInstance(
            self._infer([u'£1.7', u'£200000000', '']), Number)

    def test_boolean_type(self):
        self.assertIsInstance(self._infer(['True', 'FALSE', '']), Boolean)

    def test_date_type(self):
        values = ['5/7/1984', '2/28/1997', '3/19/2020', '']

        self.assertIsInstance(self._infer(values), Date)

    def test_date_type_iso_format(self):
        values = ['1984-05-07', '1997-02-28', '2020-03-19', '']

        self.assertIsInstance(self._infer(values), Date)

    def test_date_time_type(self):
        values = ['5/7/84 3:44:12', '2/28/1997 3:12 AM', '3/19/20 4:40 PM', '']

        self.assertIsInstance(self._infer(values), DateTime)

    def test_date_time_type_isoformat(self):
        values = [
            '1984-07-05T03:44:12',
            '1997-02-28T03:12:00',
            '2020-03-19T04:40:00',
            '',
        ]

        self.assertIsInstance(self._infer(values), DateTime)

    def test_time_delta_type(self):
        self.assertIsInstance(self._infer(['1:42', '1w 27h', '']), TimeDelta)

    def test_force_type(self):
        # A forced type for 'one' is expected to override inference.
        forcing = TypeTester(force={'one': Text()})

        self.assertIsInstance(
            self._infer(['1.7', '200000000', ''], forcing), Text)

    def test_limit(self):
        values = ['1.7', 'foo', '']

        # Expected: Number with limit=1, Text once 'foo' is within the limit.
        self.assertIsInstance(self._infer(values, TypeTester(limit=1)), Number)
        self.assertIsInstance(self._infer(values, TypeTester(limit=2)), Text)

    def test_types_force_text(self):
        text_only = TypeTester(types=[Text()])

        self.assertIsInstance(
            self._infer(['1.7', '200000000', ''], text_only), Text)

    def test_types_no_boolean(self):
        without_boolean = TypeTester(types=[Number(), Text()])

        self.assertIsInstance(
            self._infer(['True', 'False', 'False'], without_boolean), Text)

    def test_types_number_locale(self):
        german = TypeTester(types=[Number(locale='de_DE'), Text()])

        column_type = self._infer(['1,7', '200.000.000', ''], german)

        self.assertIsInstance(column_type, Number)
        self.assertEqual(column_type.locale, 'de_DE')
Example #15
0
class TestTypeTester(unittest.TestCase):
    """
    Type-inference tests for :class:`TypeTester`, each using one column
    named 'one'.
    """

    def setUp(self):
        # Shared tester with default options.
        self.tester = TypeTester()

    def _check(self, expected_type, values, tester=None):
        """
        Run a tester over ``values`` and assert the inferred column type.

        :param expected_type:
            The class the inferred type must be an instance of.
        :param values:
            Cell values, one per row.
        :param tester:
            The tester to run; ``self.tester`` when omitted.
        :returns:
            The inferred type, for tests that assert more about it.
        """
        if tester is None:
            tester = self.tester

        rows = [(value, ) for value in values]
        inferred = tester.run(rows, ['one'])

        self.assertIsInstance(inferred[0], expected_type)

        return inferred[0]

    def test_empty(self):
        # This behavior is not necessarily desirable. See https://github.com/wireservice/agate/issues/371
        self._check(Boolean, [None, None, None])

    def test_text_type(self):
        self._check(Text, ['a', 'b', ''])

    def test_number_type(self):
        self._check(Number, ['1.7', '200000000', ''])

    def test_number_percent(self):
        self._check(Number, ['1.7%', '200000000%', ''])

    def test_number_currency(self):
        self._check(Number, ['$1.7', '$200000000', ''])

    def test_number_currency_locale(self):
        self._check(Number, [u'£1.7', u'£200000000', ''])

    def test_boolean_type(self):
        self._check(Boolean, ['True', 'FALSE', ''])

    def test_date_type(self):
        self._check(Date, ['5/7/1984', '2/28/1997', '3/19/2020', ''])

    def test_date_type_iso_format(self):
        self._check(Date, ['1984-05-07', '1997-02-28', '2020-03-19', ''])

    def test_date_time_type(self):
        self._check(
            DateTime,
            ['5/7/84 3:44:12', '2/28/1997 3:12 AM', '3/19/20 4:40 PM', ''])

    def test_date_time_type_isoformat(self):
        self._check(DateTime, [
            '1984-07-05T03:44:12',
            '1997-02-28T03:12:00',
            '2020-03-19T04:40:00',
            '',
        ])

    def test_time_delta_type(self):
        self._check(TimeDelta, ['1:42', '1w 27h', ''])

    def test_force_type(self):
        # A forced type for 'one' is expected to override inference.
        self._check(Text, ['1.7', '200000000', ''],
                    tester=TypeTester(force={'one': Text()}))

    def test_limit(self):
        values = ['1.7', 'foo', '']

        # Expected: Number with limit=1, Text once 'foo' is within the limit.
        self._check(Number, values, tester=TypeTester(limit=1))
        self._check(Text, values, tester=TypeTester(limit=2))

    def test_types_force_text(self):
        self._check(Text, ['1.7', '200000000', ''],
                    tester=TypeTester(types=[Text()]))

    def test_types_no_boolean(self):
        self._check(Text, ['True', 'False', 'False'],
                    tester=TypeTester(types=[Number(), Text()]))

    def test_types_number_locale(self):
        german = TypeTester(types=[Number(locale='de_DE.UTF-8'), Text()])

        column_type = self._check(Number, ['1,7', '200.000.000', ''],
                                  tester=german)

        self.assertEqual(str(column_type.locale), 'de_DE')
Example #16
0
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        """
        Build the table's column names, column types, rows and columns.

        :param rows:
            A sequence of row value sequences. A string is rejected.
        :param column_names:
            Optional sequence of names, passed through
            :code:`utils.deduplicate`. If omitted, names are generated from
            the width of the first row and a warning is emitted.
        :param column_types:
            :code:`None` (a default :class:`TypeTester` is used), a dict of
            :class:`DataType` instances (used as the ``force`` argument of a
            :class:`TypeTester`), a :class:`TypeTester`, or a sequence of
            :class:`DataType` instances.
        :param row_names:
            A column name, a function called with each row, or a sequence
            of names. Names of exact type ``int`` are rejected. If falsy,
            the table has no row names.
        :param _is_fork:
            When true, ``rows`` is used as-is, without length checks or
            casting.
        :raises ValueError:
            For a string ``rows`` argument, invalid column types,
            mismatched name/type counts, over-long rows, or invalid
            ``row_names``.
        :raises CastError:
            When a value cannot be cast; the message is extended with the
            row index and column name.
        """
        if isinstance(rows, six.string_types):
            raise ValueError(
                'When created directly, the first argument to Table must be a sequence of rows. '
                'Did you want agate.Table.from_csv?')

        # Column names: given, generated from the first row, or none at all.
        if column_names:
            self._column_names = utils.deduplicate(column_names,
                                                   column_names=True)
        elif rows:
            width = len(rows[0])
            self._column_names = tuple(
                utils.letter_name(index) for index in range(width))
            warnings.warn(
                'Column names not specified. "%s" will be used as names.' %
                str(self._column_names),
                RuntimeWarning,
                stacklevel=2)
        else:
            self._column_names = ()

        len_column_names = len(self._column_names)

        # Column types: normalize to either a TypeTester or a checked
        # sequence of DataType instances.
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            if not all(isinstance(forced, DataType)
                       for forced in column_types.values()):
                raise ValueError(
                    'Column types must be instances of DataType.')

            column_types = TypeTester(force=column_types)
        elif not isinstance(column_types, TypeTester):
            if not all(isinstance(given, DataType) for given in column_types):
                raise ValueError(
                    'Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len(self._column_types) != len_column_names:
            raise ValueError(
                'column_names and column_types must be the same length.')

        if _is_fork:
            # Forked tables reuse their rows untouched.
            new_rows = rows
        else:
            casters = [data_type.cast for data_type in self._column_types]
            new_rows = []

            for row_index, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (row_index, len_row, len_column_names))

                if len(row) < len_column_names:
                    # Pad short rows with None up to the table width.
                    padding = [None] * (len_column_names - len_row)
                    row = chain(row, padding)

                cast_values = []

                for column_index, raw in enumerate(row):
                    try:
                        cast_values.append(casters[column_index](raw))
                    except CastError as e:
                        # Re-raise with the location of the offending cell.
                        raise CastError(
                            str(e) + ' Error at row %s column %s.' %
                            (row_index, self._column_names[column_index]))

                new_rows.append(Row(cast_values, self._column_names))

        if not row_names:
            self._row_names = None
        else:
            if isinstance(row_names, six.string_types):
                # A string is treated as the name of the column to read.
                computed_row_names = [row[row_names] for row in new_rows]
            elif hasattr(row_names, '__call__'):
                computed_row_names = [row_names(row) for row in new_rows]
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            for candidate in computed_row_names:
                if type(candidate) is int:
                    raise ValueError(
                        'Row names cannot be of type int. Use Decimal for numbered row names.'
                    )

            self._row_names = tuple(computed_row_names)

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build one Column per (name, type) pair, in positional order.
        new_columns = []

        for position, (name, data_type) in enumerate(
                zip(self._column_names, self._column_types)):
            new_columns.append(
                Column(position,
                       name,
                       data_type,
                       self._rows,
                       row_names=self._row_names))

        self._columns = MappedSequence(new_columns, self._column_names)
Example #17
0
def denormalize(self,
                key=None,
                property_column='property',
                value_column='value',
                default_value=utils.default,
                column_types=None):
    """
    Create a new table with row values converted into columns.

    For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property` becomes a
    column with `value` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new table.
        Values are converted to text before being used as names.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used where a key has no row for a given property. If
        not specified, :code:`Decimal(0)` is used when the value column's
        data type is :class:`.Number` and :code:`None` otherwise.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`. Key columns keep their
        current types unless the tester already forces a type for them. A
        tester passed in by the caller is handed back unmodified.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    # Normalize `key` to a private list so that tuples (and other non-list
    # sequences) can be concatenated with the field names below.
    if key is None:
        key = []
    elif utils.issequence(key):
        key = list(key)
    else:
        key = [key]

    # `field_names` keeps first-seen order; `seen_fields` makes the
    # membership test constant-time instead of a list scan per row.
    field_names = []
    seen_fields = set()
    row_data = OrderedDict()

    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in seen_fields:
            seen_fields.add(f)
            field_names.append(f)

        # A later row with the same key and property overwrites the earlier.
        row_data[row_key][f] = v

    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # One key column: name rows by its bare value; otherwise by the tuple.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            if f in v:
                row.append(v[f])
            else:
                row.append(default_value)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types

        # Key columns default to their existing types; anything the tester
        # already forces takes precedence.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)

        # Install the merged mapping for this run only, so that a tester
        # supplied by the caller is not left permanently altered.
        original_force = tester._force
        tester._force = force_update

        try:
            new_column_types = tester.run(new_rows, new_column_names)
        finally:
            tester._force = original_force
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows,
                 new_column_names,
                 new_column_types,
                 row_names=row_names)
Example #18
0
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        """
        Build the table's column names, column types, rows and columns.

        :param rows:
            A sequence of row sequences. Passing a string raises
            :class:`ValueError`.
        :param column_names:
            A sequence of column names. :code:`None` entries are replaced
            with the result of :code:`utils.letter_name` and repeated names
            get a :code:`_2`, :code:`_3`, ... suffix; both cases emit a
            warning. If omitted, names are generated from the length of the
            first row.
        :param column_types:
            A sequence of :class:`.DataType` instances, a dict of
            :class:`.DataType` instances (handed to :class:`.TypeTester` as
            :code:`force`), a :class:`.TypeTester`, or :code:`None` to use a
            default :class:`.TypeTester`.
        :param row_names:
            A column name, a function called with each row, or a sequence
            of names.
        :param _is_fork:
            When true, :code:`rows` is used as-is instead of being cast and
            wrapped in :class:`.Row` instances.
        :raises ValueError:
            If :code:`rows` is a string, a column name is neither a string
            nor :code:`None`, a column type is not a :class:`.DataType`, the
            number of names and types differ, a row has more values than
            there are columns, or :code:`row_names` is of an unsupported
            kind.
        """
        if isinstance(rows, six.string_types):
            raise ValueError('When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

        # Validate column names
        if column_names:
            final_column_names = []

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    new_column_name = utils.letter_name(i)
                    warnings.warn('Column name not specified. "%s" will be used as name.' % new_column_name, RuntimeWarning)
                elif isinstance(column_name, six.string_types):
                    new_column_name = column_name
                else:
                    raise ValueError('Column names must be strings or None.')

                # Append "_2", "_3", ... until the name no longer collides
                # with one that has already been accepted.
                final_column_name = new_column_name
                duplicates = 0
                while final_column_name in final_column_names:
                    final_column_name = new_column_name + '_' + str(duplicates + 2)
                    duplicates += 1

                if duplicates > 0:
                    warn_duplicate_column(new_column_name, final_column_name)

                final_column_names.append(final_column_name)

            self._column_names = tuple(final_column_names)
        elif rows:
            self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
            warnings.warn('Column names not specified. "%s" will be used as names.' % str(self._column_names), RuntimeWarning, stacklevel=2)
        else:
            # Store a tuple here as well so the attribute has the same type
            # in every branch.
            self._column_names = ()

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()

        elif isinstance(column_types, dict):
            for v in six.itervalues(column_types):
                if not isinstance(v, DataType):
                    raise ValueError('Column types must be instances of DataType.')
            column_types = TypeTester(force=column_types)

        elif not isinstance(column_types, TypeTester):
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError('Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError('column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError('Row %i has %i values, but Table only has %i columns.' % (i, len_row, len_column_names))
                elif len_row < len_column_names:
                    # Pad short rows with None so every row spans all columns.
                    row = chain(row, [None] * (len_column_names - len_row))

                # "j" is the column index; kept distinct from the row index "i".
                new_rows.append(Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)), self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                # A string names the column whose values become row names.
                for row in new_rows:
                    name = row[row_names]
                    computed_row_names.append(name)
            elif hasattr(row_names, '__call__'):
                # A callable computes the name from each row.
                for row in new_rows:
                    name = row_names(row)
                    computed_row_names.append(name)
            elif utils.issequence(row_names):
                computed_row_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
            column = Column(i, name, data_type, self._rows, row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)
Example #19
0
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Create a new table with row values converted into columns.

    For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property` becomes a
    column with `value` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new table.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used for cells that have no corresponding row in this
        table. If not specified :code:`Decimal(0)` will be used when the
        value column holds :class:`.Number` data and :code:`None` will be
        used otherwise.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    # Normalize key to a list: it is concatenated with the list of field
    # names below, which would fail for other sequence kinds such as tuples.
    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]
    else:
        key = list(key)

    field_names = []
    # Set mirror of field_names for constant-time membership checks; the
    # list preserves first-seen order for the output columns.
    seen_fields = set()
    row_data = OrderedDict()

    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in seen_fields:
            seen_fields.add(f)
            field_names.append(f)

        row_data[row_key][f] = v

    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # A single key column yields a scalar row name, otherwise the whole
        # key tuple is used.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            row.append(v.get(f, default_value))

        new_rows.append(Row(row, new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types

        # Key columns keep their current types unless the tester already
        # forces a type for the same column name.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)

        # Install the merged overrides only for this run so that a
        # TypeTester supplied by the caller is handed back unmodified.
        original_force = tester._force
        tester._force = force_update

        try:
            new_column_types = tester.run(new_rows, new_column_names)
        finally:
            tester._force = original_force
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
Example #20
0
    def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
        """
        Set up the table's column names, column types, rows, row names and
        columns.

        :param rows:
            A sequence of row sequences. A string raises
            :class:`ValueError`.
        :param column_names:
            A sequence of column names, passed through
            :code:`utils.deduplicate`. If omitted, names are generated with
            :code:`utils.letter_name` from the length of the first row and
            a warning is emitted.
        :param column_types:
            A sequence of :class:`.DataType` instances, a dict of
            :class:`.DataType` instances (handed to :class:`.TypeTester` as
            :code:`force`), a :class:`.TypeTester`, or :code:`None` to use a
            default :class:`.TypeTester`.
        :param row_names:
            A column name, a function called with each row, or a sequence
            of names. Names of exact type :class:`int` are rejected.
        :param _is_fork:
            When true, :code:`rows` is used as-is instead of being cast and
            wrapped in :class:`.Row` instances.
        :raises ValueError:
            If :code:`rows` is a string, a column type is not a
            :class:`.DataType`, the number of names and types differ, a row
            has more values than there are columns, :code:`row_names` is of
            an unsupported kind, or a row name is an :class:`int`.
        :raises CastError:
            If a value cannot be cast; the message is extended with the row
            index and column name.
        """
        if isinstance(rows, six.string_types):
            raise ValueError('When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?')

        # Column names: explicit names win, then names derived from the
        # first row, then nothing at all.
        if column_names:
            self._column_names = utils.deduplicate(column_names, column_names=True)
        elif rows:
            self._column_names = tuple(utils.letter_name(position) for position in range(len(rows[0])))
            warnings.warn('Column names not specified. "%s" will be used as names.' % str(self._column_names), RuntimeWarning, stacklevel=2)
        else:
            self._column_names = ()

        width = len(self._column_names)

        # Column types: reduce every accepted input to either a TypeTester
        # or an already-validated sequence of DataType instances.
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, dict):
            if not all(isinstance(forced, DataType) for forced in column_types.values()):
                raise ValueError('Column types must be instances of DataType.')

            column_types = TypeTester(force=column_types)
        elif not isinstance(column_types, TypeTester):
            if not all(isinstance(candidate, DataType) for candidate in column_types):
                raise ValueError('Column types must be instances of DataType.')

        self._column_types = (
            column_types.run(rows, self._column_names)
            if isinstance(column_types, TypeTester)
            else tuple(column_types)
        )

        if len(self._column_types) != width:
            raise ValueError('column_names and column_types must be the same length.')

        if _is_fork:
            # Forked tables reuse rows that were already cast.
            new_rows = rows
        else:
            casts = [data_type.cast for data_type in self._column_types]
            new_rows = []

            for row_index, row in enumerate(rows):
                row_width = len(row)

                if row_width > width:
                    raise ValueError('Row %i has %i values, but Table only has %i columns.' % (row_index, row_width, width))

                if row_width < width:
                    # Pad short rows with None so every row spans all columns.
                    row = chain(row, [None] * (width - row_width))

                values = []

                for column_index, raw in enumerate(row):
                    try:
                        cast_value = casts[column_index](raw)
                    except CastError as e:
                        raise CastError(str(e) + ' Error at row %s column %s.' % (row_index, self._column_names[column_index]))

                    values.append(cast_value)

                new_rows.append(Row(values, self._column_names))

        if not row_names:
            self._row_names = None
        else:
            if isinstance(row_names, six.string_types):
                # A string names the column whose values become row names.
                resolved_names = [row[row_names] for row in new_rows]
            elif hasattr(row_names, '__call__'):
                # A callable computes the name from each row.
                resolved_names = [row_names(row) for row in new_rows]
            elif utils.issequence(row_names):
                resolved_names = row_names
            else:
                raise ValueError('row_names must be a column name, function or sequence')

            if any(type(candidate) is int for candidate in resolved_names):
                raise ValueError('Row names cannot be of type int. Use Decimal for numbered row names.')

            self._row_names = tuple(resolved_names)

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        self._columns = MappedSequence(
            [
                Column(index, name, data_type, self._rows, row_names=self._row_names)
                for index, (name, data_type) in enumerate(zip(self._column_names, self._column_types))
            ],
            self._column_names
        )