def test_max_length(self):
    """MaxLength reports the length of the longest string in a Text column."""
    data = [Row([value], ['test']) for value in ('a', 'gobble', 'w')]
    column = Column(0, 'test', Text(), data)
    self.assertEqual(column.aggregate(MaxLength()), 6)
def test_max(self):
    """Max on a DateTime column yields the latest value and a DateTime result type."""
    timestamps = (
        datetime.datetime(1994, 3, 3, 6, 31),
        datetime.datetime(1994, 3, 3, 6, 30, 30),
        datetime.datetime(1994, 3, 3, 6, 30),
    )
    data = [Row([ts], ['test']) for ts in timestamps]
    column = Column(0, 'test', DateTime(), data)
    self.assertIsInstance(Max().get_aggregate_data_type(column), DateTime)
    self.assertEqual(column.aggregate(Max()), datetime.datetime(1994, 3, 3, 6, 31))
def compute(self, computations):
    """
    Compute new columns by applying one or more :class:`.Computation` to
    each row.

    :param computations:
        A sequence of pairs of new column names and :class:`.Computation`
        instances.
    :returns:
        A new :class:`Table`.
    """
    names = list(copy(self._column_names))
    types = list(copy(self._column_types))

    # Register each new column and let every computation validate itself
    # against this table before anything is evaluated.
    for name, computation in computations:
        names.append(name)
        types.append(computation.get_computed_data_type(self))
        computation.validate(self)

    # Each computation produces a full column of values at once.
    computed = tuple(computation.run(self) for _, computation in computations)

    new_rows = [
        Row(tuple(row) + tuple(column[i] for column in computed), names)
        for i, row in enumerate(self._rows)
    ]

    return self._fork(new_rows, names, types)
def compute(self, computations):
    """
    Compute new columns by applying one or more :class:`.Computation` to
    each row.

    :param computations:
        An iterable of pairs of :class:`.Computation` instances and new
        column names, in that order — each pair is unpacked as
        ``(computation, new_column_name)``.
    :returns:
        A new :class:`Table`.
    """
    column_names = list(copy(self._column_names))
    column_types = list(copy(self._column_types))

    for computation, new_column_name in computations:
        # Fail fast on malformed pairs rather than erroring later in run().
        if not isinstance(computation, Computation):
            raise ValueError('The first element in pair must be a Computation instance.')

        column_names.append(new_column_name)
        column_types.append(computation.get_computed_data_type(self))
        computation.prepare(self)

    new_rows = []

    for row in self._rows:
        # Computations are applied row-by-row; `computations` is iterated
        # once per row, so it must be a reusable iterable (not a generator).
        new_columns = tuple(c.run(row) for c, n in computations)
        new_rows.append(Row(tuple(row) + new_columns, column_names))

    # NOTE(review): _fork is passed zipped (name, type) pairs here, unlike
    # other variants that pass names and types as separate arguments —
    # confirm this matches the _fork signature in this version.
    return self._fork(new_rows, zip(column_names, column_types))
def merge(cls, tables):
    """
    Merge a sequence of tables with identical column types into a single
    table.

    Column names need not be identical: the first table's column names are
    the ones used for the merged result.

    :param tables:
        A sequence of :class:`Table` instances.
    :returns:
        A new :class:`Table`.
    """
    first = tables[0]
    column_names = first.column_names
    column_types = first.column_types

    # Every table must agree on column types; names may differ.
    for table in tables[1:]:
        if table.column_types != column_types:
            raise ValueError(
                'Only tables with identical column types may be merged.')

    rows = []

    for table in tables:
        if table.column_names == column_names:
            # Names already match, so rows can be reused as-is.
            rows.extend(table.rows)
        else:
            # Re-key each row under the canonical column names.
            rows.extend(Row(row.values(), column_names) for row in table.rows)

    return Table(rows, column_names, column_types, row_names=first.row_names, _is_fork=True)
def compute(self, computations, replace=False):
    """
    Create a new table by applying one or more :class:`.Computation` instances
    to each row.

    :param computations:
        A sequence of pairs of new column names and :class:`.Computation`
        instances.
    :param replace:
        If :code:`True` then new column names can match existing names, and
        those columns will be replaced with the computed data.
    :returns:
        A new :class:`.Table`.
    :raises ValueError:
        If a new column name collides with an existing one and ``replace``
        is false.
    """
    column_names = list(copy(self._column_names))
    column_types = list(copy(self._column_types))

    for new_column_name, computation in computations:
        new_column_type = computation.get_computed_data_type(self)

        if new_column_name in column_names:
            if not replace:
                # Bug fix: the original raised this message without applying
                # the "%" operator, so the "%s" placeholder was never filled
                # with the offending column name.
                raise ValueError(
                    'New column name "%s" already exists. Specify replace=True to replace with computed data.'
                    % new_column_name
                )

            # Replacing: keep the column's position, update its type.
            i = column_names.index(new_column_name)
            column_types[i] = new_column_type
        else:
            column_names.append(new_column_name)
            column_types.append(new_column_type)

        computation.validate(self)

    # Evaluate each computation once, as a whole column of values.
    new_columns = OrderedDict()

    for new_column_name, computation in computations:
        new_columns[new_column_name] = computation.run(self)

    new_rows = []

    for i, row in enumerate(self._rows):
        # Slow version if using replace: values must be interleaved at
        # their original column positions.
        if replace:
            values = []

            for j, column_name in enumerate(column_names):
                if column_name in new_columns:
                    values.append(new_columns[column_name][i])
                else:
                    values.append(row[j])
        # Faster version if not using replace: new values simply append.
        else:
            values = row.values() + tuple(c[i] for c in new_columns.values())

        new_rows.append(Row(values, column_names))

    return self._fork(new_rows, column_names, column_types)
def merge(self, groups=None, group_name=None, group_type=None):
    """
    Convert this TableSet into a single table.

    This is the inverse of :meth:`.Table.group_by`.

    Any `row_names` set on the merged tables will be lost in this process.

    :param groups:
        A list of grouping factors to add to merged rows in a new column.
        If specified, it should have exactly one element per :class:`Table`
        in the :class:`TableSet`. If not specified or None, the grouping
        factor will be the name of the :class:`Row`'s original Table.
    :param group_name:
        This will be the column name of the grouping factors. If None,
        defaults to the :attr:`TableSet.key_name`.
    :param group_type:
        This will be the column type of the grouping factors. If None,
        defaults to the :attr:`TableSet.key_type`.
    :returns:
        A new :class:`Table`.
    """
    # Exact `type(...) is list` check (not isinstance) is deliberate and
    # preserved: list subclasses are rejected exactly as before.
    if groups is not None and type(groups) is not list:
        raise ValueError('Groups must be None or a list.')

    if type(groups) is list and len(groups) != len(self):
        raise ValueError('Groups length must be equal to TableSet length.')

    # The grouping factor becomes the first column of the merged table.
    column_names = [group_name if group_name else self.key_name] + list(self.column_names)
    column_types = [group_type if group_type else self.key_type] + list(self.column_types)

    rows = []

    for index, (key, table) in enumerate(self.items()):
        factor = key if groups is None else groups[index]

        for row in table.rows:
            rows.append(Row((factor,) + tuple(row), column_names))

    return Table(rows, column_names, column_types)
def merge(cls, tables, row_names=None, column_names=None):
    """
    Create a new table from a sequence of similar tables.

    This method will not carry over row names from the merged tables, but new
    row names can be specified with the :code:`row_names` argument.

    It is possible to limit the columns included in the new :class:`.Table`
    with :code:`column_names` argument. For example, to only include columns
    from a specific table, set :code:`column_names` equal to
    :code:`table.column_names`.

    :param tables:
        A sequence of :class:`.Table` instances.
    :param row_names:
        See :class:`.Table` for the usage of this parameter.
    :param column_names:
        A sequence of column names to include in the new :class:`.Table`. If
        not specified, all distinct column names from `tables` are included.
    :returns:
        A new :class:`.Table`.
    :raises DataTypeError:
        If two tables share a column name with differing types.
    """
    from agate.table import Table

    # Union of column (name -> type), in first-seen order.
    new_columns = OrderedDict()

    for table in tables:
        for i in range(0, len(table.columns)):
            if column_names is None or table.column_names[i] in column_names:
                column_name = table.column_names[i]
                column_type = table.column_types[i]

                if column_name in new_columns:
                    if not isinstance(column_type, type(new_columns[column_name])):
                        raise DataTypeError('Tables contain columns with the same names, but different types.')
                else:
                    new_columns[column_name] = column_type

    # Bug fix: materialize the dict views as tuples. They were previously
    # dict_keys/dict_values objects, which never compare equal to the
    # tuples returned by Table.column_names/column_types, so the fast path
    # below could never be taken.
    column_keys = tuple(new_columns.keys())
    column_types = tuple(new_columns.values())

    rows = []

    for table in tables:
        # Performance optimization for identical table structures
        if table.column_names == column_keys and table.column_types == column_types:
            rows.extend(table.rows)
        else:
            # Rebuild each row against the merged column set, padding
            # missing columns with None.
            for row in table.rows:
                data = []

                for column_key in column_keys:
                    data.append(row.get(column_key, None))

                rows.append(Row(data, column_keys))

    return Table(rows, column_keys, column_types, row_names=row_names, _is_fork=True)
def _get_row(self, i):
    """
    Get a :class:`.Row` of data, caching a copy for the next request.

    :param i: Integer index of the row to fetch.
    :returns: The cached :class:`.Row` for index ``i``.
    """
    if i not in self._cached_rows:
        # If rows are from a fork, they are safe to access directly
        if isinstance(self._data[i], Row):
            self._cached_rows[i] = self._data[i]
        else:
            # NOTE(review): Row is constructed from (container, index) here —
            # presumably a lazy view over this object's data; confirm against
            # the Row signature in this version of the library.
            self._cached_rows[i] = Row(self, i)

    return self._cached_rows[i]
def select(self, selected_names):
    """
    Reduce this table to only the specified columns.

    :param selected_names:
        A sequence of names of columns to include in the new table.
    :returns:
        A new :class:`Table`.
    """
    # NOTE(review): this passes Column objects (not names and types) as the
    # second argument to _fork, unlike other select() variants — confirm
    # this matches the _fork signature in this version.
    new_columns = [self.columns[name] for name in selected_names]
    new_rows = []

    for row in self._rows:
        # Keep only the selected values, in the requested order.
        new_rows.append(Row(tuple(row[n] for n in selected_names), selected_names))

    return self._fork(new_rows, new_columns)
def select(self, key):
    """
    Create a new table with only the specified columns.

    :param key:
        Either the name of a single column to include or a sequence of such
        names.
    :returns:
        A new :class:`.Table`.
    """
    # Normalize a lone column name into a one-element sequence.
    if not utils.issequence(key):
        key = [key]

    column_types = [self.columns[name].data_type for name in key]

    # Project every row down to the selected columns, preserving order.
    new_rows = [
        Row(tuple(row[name] for name in key), key)
        for row in self._rows
    ]

    return self._fork(new_rows, key, column_types)
def select(self, key):
    """
    Create a new table with only the specified columns.

    :param key:
        Either the name of a single column to include or a sequence of such
        names.
    :returns:
        A new :class:`.Table`.
    """
    if not utils.issequence(key):
        key = [key]

    # Resolve each requested column name to its positional index once.
    indexes = tuple(self._column_names.index(k) for k in key)
    column_types = tuple(self._column_types[i] for i in indexes)

    new_rows = []

    for row in self._rows:
        # Bug fix: materialize the selected values as a tuple. The original
        # handed Row a one-shot generator expression, which can only be
        # iterated once and breaks any repeated access to the row's values.
        new_rows.append(Row(tuple(row[i] for i in indexes), key))

    return self._fork(new_rows, key, column_types)
def select(self, selected_column_names):
    """
    Create a new table with the same rows as this one, but only those columns
    in the ``selected_column_names`` sequence.

    :param selected_column_names:
        A sequence of names of columns to include in the new table.
    :returns:
        A new :class:`Table`.
    """
    # Data type of every requested column, in the requested order.
    column_types = [self.columns[name].data_type for name in selected_column_names]

    # Rebuild each row restricted to the requested columns.
    new_rows = [
        Row(tuple(row[name] for name in selected_column_names), selected_column_names)
        for row in self._rows
    ]

    return self._fork(new_rows, selected_column_names, column_types)
def merge(cls, tables, row_names=None):
    """
    Merge a sequence of tables with identical column types into a single
    table.

    Column names need not be identical; the first table's column names are
    the ones used. Row names will be lost, but new row names can be
    specified with the `row_names` argument.

    :param tables:
        A sequence of :class:`Table` instances.
    :param row_names:
        See :class:`Table` for the usage of this parameter.
    :returns:
        A new :class:`Table`.
    """
    first = tables[0]
    column_names = first.column_names
    column_types = first.column_types

    for table in tables[1:]:
        # zip_longest pads with None, so a length mismatch also fails the
        # isinstance check and is rejected.
        pairs = zip_longest(table.column_types, column_types)

        if any(not isinstance(a, type(b)) for a, b in pairs):
            raise ValueError(
                'Only tables with identical column types may be merged.')

    rows = []

    for table in tables:
        if table.column_names == column_names:
            # Names already match; reuse rows directly.
            rows.extend(table.rows)
        else:
            # Re-key each row under the canonical column names.
            rows.extend(Row(row.values(), column_names) for row in table.rows)

    return Table(rows, column_names, column_types, row_names=row_names, _is_fork=True)
def test_all(self):
    """All() is False when any value is None, True when every value is True."""
    with_none = [Row([value], ['test']) for value in (True, True, None)]
    column = Column(0, 'test', Boolean(), with_none)
    self.assertEqual(column.aggregate(All()), False)

    all_true = [Row([True], ['test']) for _ in range(3)]
    column = Column(0, 'test', Boolean(), all_true)
    self.assertEqual(column.aggregate(All()), True)
def merge(self):
    """
    Convert this TableSet into a single table.

    This is the inverse of :meth:`.Table.group_by`.

    Any :code:`row_names` set on the merged tables will be lost in this
    process.

    :returns:
        A new :class:`Table`.
    """
    # The grouping key becomes the first column of the merged table.
    column_names = [self.key_name] + list(self.column_names)
    column_types = [self.key_type] + list(self.column_types)

    rows = []

    for key, table in self.items():
        rows.extend(Row((key,) + tuple(row), column_names) for row in table.rows)

    return Table(rows, column_names, column_types)
def join(self, right_table, left_key, right_key=None, inner=False):
    """
    Performs the equivalent of SQL's "left outer join", combining columns
    from this table and from :code:`right_table` anywhere that the output of
    :code:`left_key` and :code:`right_key` are equivalent.

    Where there is no match for :code:`left_key` the left columns will
    be included with the right columns set to :code:`None` unless
    the :code:`inner` argument is specified. (See arguments for more.)

    If :code:`left_key` and :code:`right_key` are column names, only
    the left column will be included in the output table.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from this table to join on, or a
        :class:`function` that takes a row and returns a value to join on.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, or
        a :class:`function` that takes a row and returns a value to join on.
        If :code:`None` then :code:`left_key` will be used for both.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :returns:
        A new :class:`Table`.
    """
    left_key_is_row_function = hasattr(left_key, '__call__')

    if right_key is None:
        right_key = left_key

    right_key_is_row_function = hasattr(right_key, '__call__')

    # Get join columns
    right_key_index = None

    if left_key_is_row_function:
        left_data = [left_key(row) for row in self.rows]
    else:
        left_data = self._columns[left_key].values()

    if right_key_is_row_function:
        right_data = [right_key(row) for row in right_table.rows]
    else:
        right_column = right_table.columns[right_key]
        right_data = right_column.values()
        # Remember the position of the right-hand key column so it can be
        # dropped from joined rows below.
        right_key_index = right_table.columns._keys.index(right_key)

    # Build names and type lists
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for column in right_table.columns:
        name = column.name

        # When joining on a column name, skip the right-hand key column
        # (the left copy is kept). When right_key is a function this
        # comparison is never true, so all right columns are kept.
        if name == right_key:
            continue

        if name in self._column_names:
            # Disambiguate duplicate names with a "2" suffix.
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    # Hash the right-hand rows by join value for O(1) match lookup.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table._rows[i])

    # Collect new rows
    rows = []

    if self._row_names is not None:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])

                for k, v in enumerate(right_row):
                    if k == right_key_index:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self._row_names is not None:
                    row_names.append(self._row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self._rows[left_index])

            # Pad every right-hand column with None.
            for k, v in enumerate(right_table.column_names):
                if k == right_key_index:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self._row_names is not None:
                row_names.append(self._row_names[left_index])

    return self._fork(rows, column_names, column_types, row_names=row_names)
def join(self, right_table, left_key=None, right_key=None, inner=False, full_outer=False, require_match=False, columns=None):
    """
    Create a new table by joining two table's on common values.

    This method implements most varieties of SQL join, in addition to some
    unique features.

    If :code:`left_key` and :code:`right_key` are both :code:`None` then
    this method will perform a "sequential join", which is to say it will
    join on row number. The :code:`inner` and :code:`full_outer` arguments
    will determine whether dangling left-hand and right-hand rows are
    included, respectively.

    If :code:`left_key` is specified, then a "left outer join" will be
    performed. This will combine columns from the :code:`right_table`
    anywhere that :code:`left_key` and :code:`right_key` are equal.
    Unmatched rows from the left table will be included with the right-hand
    columns set to :code:`None`.

    If :code:`inner` is :code:`True` then an "inner join" will be performed.
    Unmatched rows from either table will be left out.

    If :code:`full_outer` is :code:`True` then a "full outer join" will be
    performed. Unmatched rows from both tables will be included, with the
    columns in the other table set to :code:`None`.

    In all cases, if :code:`right_key` is :code:`None` then
    :code:`left_key` will be used for both tables.

    If :code:`left_key` and :code:`right_key` are column names, the
    right-hand identifier column will not be included in the output table.

    If :code:`require_match` is :code:`True` unmatched rows will raise an
    exception. This is like an "inner join" except any row that doesn't
    have a match will raise an exception instead of being dropped. This is
    useful for enforcing expectations about datasets that should match.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    A subset of columns from the right-hand table can be included in the
    joined table using the :code:`columns` argument.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from the this table to join on, the
        index of a column, a sequence of such column identifiers, a
        :class:`function` that takes a row and returns a value to join on,
        or :code:`None` in which case the tables will be joined on row
        number.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on,
        the index of a column, a sequence of such column identifiers, or a
        :class:`function` that takes a row and returns a value to join on.
        If :code:`None` then :code:`left_key` will be used for both. If
        :code:`left_key` is :code:`None` then this value is ignored.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param full_outer:
        Perform a SQL-style "full outer" join rather than a left or a
        right. May not be used in combination with :code:`inner`.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`. Ignored when :code:`full_outer` is :code:`True`.
    :returns:
        A new :class:`.Table`.
    """
    if inner and full_outer:
        raise ValueError('A join can not be both "inner" and "full_outer".')

    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is None
    if left_key is None:
        # Sequential join: key every left row by its row number.
        left_data = tuple(range(len(self._rows)))
    # Left key is a function
    elif left_key_is_func:
        left_data = [left_key(row) for row in self._rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self._columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self._columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Sequential join
    if left_key is None:
        right_data = tuple(range(len(right_table._rows)))
    # Right key is a function
    elif right_key_is_func:
        right_data = [right_key(row) for row in right_table._rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table._columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        # Positions of the right-hand key columns, to drop them from output.
        right_key_indices = [right_table._columns._keys.index(key) for key in right_key]
    # Right key is a column name/index
    else:
        right_column = right_table._columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table._columns.index(right_column)]

    # Build names and type lists
    column_names = list(self._column_names)
    column_types = list(self._column_types)

    for i, column in enumerate(right_table._columns):
        name = column.name

        if not full_outer:
            # Outside a full outer join, drop the right-hand key columns
            # (unless a column subset was explicitly requested) and any
            # column not in the requested subset.
            if columns is None and i in right_key_indices:
                continue

            if columns is not None and name not in columns:
                continue

        if name in self.column_names:
            # Disambiguate duplicate names with a "2" suffix.
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None and not full_outer:
        # Restrict the right table to the requested columns before hashing.
        right_table = right_table.select([n for n in right_table._column_names if n in columns])

    # Hash the right-hand rows by join value for O(1) match lookup.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table._rows[i])

    # Collect new rows
    rows = []

    if self._row_names is not None and not full_outer:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError('Left key "%s" does not have a matching right key.' % left_value)

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self._rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices and not full_outer:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self._row_names is not None and not full_outer:
                    row_names.append(self._row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self._rows[left_index])

            # Pad the right-hand columns with None.
            for k, v in enumerate(right_table._column_names):
                if columns is None and k in right_key_indices and not full_outer:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self._row_names is not None and not full_outer:
                row_names.append(self._row_names[left_index])

    # Full outer join
    if full_outer:
        # Append right-hand rows whose key never appeared on the left,
        # padding the left-hand columns with None.
        left_set = set(left_data)

        for right_index, right_value in enumerate(right_data):
            if right_value in left_set:
                continue

            new_row = ([None] * len(self._columns)) + list(right_table.rows[right_index])

            rows.append(Row(new_row, column_names))

    return self._fork(rows, column_names, column_types, row_names=row_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    :param rows:
        A sequence of row sequences. Must not be a string.
    :param column_names:
        A sequence of column names; if omitted, letter names ("a", "b", ...)
        are generated and a warning is issued.
    :param column_types:
        ``None`` (infer with TypeTester), a dict of forced types, a
        TypeTester, or a sequence of :class:`DataType` instances.
    :param row_names:
        A column name, a function of the row, or a sequence of names.
    :param _is_fork:
        Internal flag: when true, ``rows`` are already-cast :class:`Row`
        instances and casting/validation of values is skipped.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. '
            'Did you want agate.Table.from_csv?')

    # Validate column names
    if column_names:
        self._column_names = utils.deduplicate(column_names, column_names=True)
    elif rows:
        # No names given: generate letter names sized to the first row.
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.'
            % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        self._column_names = tuple()

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        for v in column_types.values():
            if not isinstance(v, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        # Infer (or force) types by inspecting the data.
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError(
            'column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        # Hoist the cast callables out of the row loop.
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                # Pad short rows with None.
                row = chain(row, [None] * (len_column_names - len_row))

            row_values = []

            for j, d in enumerate(row):
                try:
                    row_values.append(cast_funcs[j](d))
                except CastError as e:
                    # Re-raise with row/column context appended.
                    raise CastError(
                        str(e) + ' Error at row %s column %s.'
                        % (i, self._column_names[j]))

            new_rows.append(Row(row_values, self._column_names))
    else:
        # Forked rows are already Row instances; use them as-is.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # row_names is a column name: take that column's values.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # row_names is a function of the row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            computed_row_names = row_names
        else:
            raise ValueError(
                'row_names must be a column name, function or sequence')

        for row_name in computed_row_names:
            if type(row_name) is int:
                raise ValueError(
                    'Row names cannot be of type int. Use Decimal for numbered row names.'
                )

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i in range(len_column_names):
        name = self._column_names[i]
        data_type = self._column_types[i]

        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    :param rows:
        A sequence of row sequences.
    :param column_names:
        A sequence of column names (strings or ``None``); ``None`` entries
        get generated letter names. If omitted entirely, letter names are
        generated for every column of the first row.
    :param column_types:
        ``None`` (infer with TypeTester), a TypeTester, or a sequence of
        :class:`DataType` instances.
    :param row_names:
        A column name, a function of the row, or a sequence of names.
    :param _is_fork:
        Internal flag: when true, ``rows`` are already-cast :class:`Row`
        instances and casting is skipped.
    """
    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                # None placeholders get spreadsheet-style letter names.
                final_column_names.append(utils.letter_name(i))
            elif isinstance(column_name, six.string_types):
                final_column_names.append(column_name)
            else:
                raise ValueError('Column names must be strings or None.')

        if len(set(final_column_names)) != len(final_column_names):
            raise ValueError('Duplicate column names are not allowed.')

        self._column_names = tuple(final_column_names)
    else:
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, TypeTester):
        pass
    else:
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        # Infer types by inspecting the data.
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError(
            'column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        # Hoist the cast callables out of the row loop.
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                # Pad short rows with None.
                row = chain(row, [None] * (len(self.column_names) - len_row))

            # Note: the `i` inside this generator is the column index and
            # only shadows the outer row index within the expression scope.
            new_rows.append(
                Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)), self._column_names))
    else:
        # Forked rows are already Row instances; use them as-is.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # row_names is a column name: take that column's values.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # row_names is a function of the row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif isinstance(row_names, Sequence):
            computed_row_names = row_names
        else:
            raise ValueError(
                'row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(
            zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def join(self, right_table, left_key, right_key=None, inner=False, require_match=False, columns=None):
    """
    Create a new table by joining two table's on common values.

    This method performs the equivalent of SQL's "left outer join",
    combining columns from this table and from :code:`right_table` anywhere
    that the :code:`left_key` and :code:`right_key` are equivalent.

    Where there is no match for :code:`left_key` the left columns will be
    included with the right columns set to :code:`None` unless the
    :code:`inner` argument is specified.

    If :code:`left_key` and :code:`right_key` are column names, only the
    left columns will be included in the output table.

    Column names from the right table which also exist in this table will
    be suffixed "2" in the new table.

    :param right_table:
        The "right" table to join to.
    :param left_key:
        Either the name of a column from this table to join on, a sequence
        of such column names, or a :class:`function` that takes a row and
        returns a value to join on.
    :param right_key:
        Either the name of a column from :code:`right_table` to join on, a
        sequence of such column names, or a :class:`function` that takes a
        row and returns a value to join on. If :code:`None` then
        :code:`left_key` will be used for both.
    :param inner:
        Perform a SQL-style "inner join" instead of a left outer join. Rows
        which have no match for :code:`left_key` will not be included in
        the output table.
    :param require_match:
        If true, an exception will be raised if there is a left_key with no
        matching right_key.
    :param columns:
        A sequence of column names from :code:`right_table` to include in
        the final output table. Defaults to all columns not in
        :code:`right_key`.
    :returns:
        A new :class:`.Table`.
    """
    if right_key is None:
        right_key = left_key

    # Get join columns
    right_key_indices = []

    left_key_is_func = hasattr(left_key, '__call__')
    left_key_is_sequence = utils.issequence(left_key)

    # Left key is a function
    if left_key_is_func:
        left_data = [left_key(row) for row in self.rows]
    # Left key is a sequence
    elif left_key_is_sequence:
        left_columns = [self.columns[key] for key in left_key]
        left_data = zip(*[column.values() for column in left_columns])
    # Left key is a column name/index
    else:
        left_data = self.columns[left_key].values()

    right_key_is_func = hasattr(right_key, '__call__')
    right_key_is_sequence = utils.issequence(right_key)

    # Right key is a function
    if right_key_is_func:
        right_data = [right_key(row) for row in right_table.rows]
    # Right key is a sequence
    elif right_key_is_sequence:
        right_columns = [right_table.columns[key] for key in right_key]
        right_data = zip(*[column.values() for column in right_columns])
        # Positions of the right-hand key columns, to drop them from output.
        right_key_indices = [
            right_table.columns._keys.index(key) for key in right_key
        ]
    # Right key is a column name/index
    else:
        right_column = right_table.columns[right_key]
        right_data = right_column.values()
        right_key_indices = [right_table.columns._keys.index(right_key)]

    # Build names and type lists
    column_names = list(self.column_names)
    column_types = list(self.column_types)

    for i, column in enumerate(right_table.columns):
        name = column.name

        # Drop right-hand key columns (unless an explicit column subset was
        # requested) and any column not in the requested subset.
        if columns is None and i in right_key_indices:
            continue

        if columns is not None and name not in columns:
            continue

        if name in self.column_names:
            # Disambiguate duplicate names with a "2" suffix.
            column_names.append('%s2' % name)
        else:
            column_names.append(name)

        column_types.append(column.data_type)

    if columns is not None:
        # Restrict the right table to the requested columns before hashing.
        right_table = right_table.select(
            [n for n in right_table.column_names if n in columns])

    # Hash the right-hand rows by join value for O(1) match lookup.
    right_hash = {}

    for i, value in enumerate(right_data):
        if value not in right_hash:
            right_hash[value] = []

        right_hash[value].append(right_table.rows[i])

    # Collect new rows
    rows = []

    if self.row_names is not None:
        row_names = []
    else:
        row_names = None

    # Iterate over left column
    for left_index, left_value in enumerate(left_data):
        matching_rows = right_hash.get(left_value, None)

        if require_match and matching_rows is None:
            raise ValueError(
                'Left key "%s" does not have a matching right key.' % left_value)

        # Rows with matches
        if matching_rows:
            for right_row in matching_rows:
                new_row = list(self.rows[left_index])

                for k, v in enumerate(right_row):
                    if columns is None and k in right_key_indices:
                        continue

                    new_row.append(v)

                rows.append(Row(new_row, column_names))

                if self.row_names is not None:
                    row_names.append(self.row_names[left_index])
        # Rows without matches
        elif not inner:
            new_row = list(self.rows[left_index])

            # Pad the right-hand columns with None.
            for k, v in enumerate(right_table.column_names):
                if columns is None and k in right_key_indices:
                    continue

                new_row.append(None)

            rows.append(Row(new_row, column_names))

            if self.row_names is not None:
                row_names.append(self.row_names[left_index])

    return self._fork(rows, column_names, column_types, row_names=row_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Build a table from a sequence of row sequences.

    :param rows:
        A sequence of row sequences. Must not be a string — a common
        mistake when a CSV path is passed instead of data.
    :param column_names:
        A sequence of names, or ``None`` to generate letter names sized to
        the first row. Individual ``None`` entries are replaced with letter
        names and duplicates are renamed with a numeric suffix; both cases
        emit a :class:`RuntimeWarning`.
    :param column_types:
        ``None`` (infer types with a default :class:`.TypeTester`), a dict
        mapping column names to :class:`.DataType` instances (forced during
        inference), a :class:`.TypeTester`, or a sequence of
        :class:`.DataType` instances, one per column.
    :param row_names:
        A column name, a function of a row, a sequence of names, or
        ``None``.
    :param _is_fork:
        Internal flag. When ``True``, ``rows`` are assumed to already be
        cast :class:`.Row` instances and are used without re-casting.
    """
    if isinstance(rows, six.string_types):
        raise ValueError(
            'When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?'
        )

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                # Missing name: substitute a spreadsheet-style letter name.
                new_column_name = utils.letter_name(i)
                warnings.warn(
                    'Column name not specified. "%s" will be used as name.' % new_column_name,
                    RuntimeWarning)
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError('Column names must be strings or None.')

            final_column_name = new_column_name
            duplicates = 0

            # Disambiguate duplicates by appending "_2", "_3", ... until
            # the name is unique among those accepted so far.
            while final_column_name in final_column_names:
                final_column_name = new_column_name + '_' + str(
                    duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        # No names given: generate letter names sized to the first row.
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.'
            % str(self._column_names), RuntimeWarning, stacklevel=2)
    else:
        self._column_names = []

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, dict):
        # A dict means "force these types during inference".
        for v in six.itervalues(column_types):
            if not isinstance(v, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

        column_types = TypeTester(force=column_types)
    elif not isinstance(column_types, TypeTester):
        # Otherwise it must be a plain sequence of DataType instances.
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        # Infer (possibly partially forced) types from the data.
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError(
            'column_names and column_types must be the same length.')

    if not _is_fork:
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                # Pad short rows with None values.
                row = chain(row, [None] * (len(self.column_names) - len_row))

            # Cast each value with its column's cast function. The inner
            # "i" lives in the generator's own scope and does not clobber
            # the outer row index.
            new_rows.append(
                Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)),
                    self._column_names))
    else:
        # Forked tables pass pre-cast Row instances through unchanged.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's value from each row.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # A function: call it with each row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            # A literal sequence of names.
            computed_row_names = row_names
        else:
            raise ValueError(
                'row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(
            zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows,
                        row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table with columns converted into rows.

    Each row of this table produces one output row per property,
    consisting of the key values followed by the property name and the
    property value. For example, a table with columns ``name``,
    ``gender``, ``race`` and ``age`` normalized on ``gender``, ``race``
    and ``age`` yields columns ``name``, ``property`` and ``value``, with
    three output rows per input row.

    This is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the tables unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column
        in that order or an instance of :class:`.TypeTester`. Defaults to
        a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    # Accept scalars as well as sequences for both arguments.
    if not utils.issequence(key):
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    new_rows = []
    row_names = []

    for source_row in self.rows:
        key_values = tuple(source_row[name] for name in key)
        prefix = list(key_values)

        # Single-column keys become scalar row names; multi-column keys
        # stay tuples.
        row_names.append(key_values[0] if len(key_values) == 1 else key_values)

        for prop in properties:
            new_rows.append(
                Row(tuple(prefix + [prop, source_row[prop]]), new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = column_types if column_types is not None else TypeTester()

        # Pin the key columns to their existing types, but let any types
        # the caller explicitly forced win.
        forced = dict(zip(key, key_column_types))
        forced.update(tester._force)
        tester._force = forced

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def homogenize(self, key, compare_values, default_row=None):
    """
    Fill in missing rows in a series.

    This can be used, for instance, to add rows for missing years in a
    time series. Missing rows are found by comparing the values in the
    :code:`key` columns with those provided as :code:`compare_values`.
    Values not found in the table will be used to generate new rows with
    the given :code:`default_row`.

    :code:`default_row` should be an array of values or an
    array-generating function. If not specified, the new rows will have
    :code:`None` in all columns not specified in :code:`key`.

    If :code:`default_row` is an array of values, its length should be
    row length minus the number of column names provided in the
    :code:`key`. If it is an array-generating function, the function
    should take an array of missing values for each new row and output a
    full row including those values.

    :param key:
        Either a column name or a sequence of such names.
    :param compare_values:
        Either an array of column values if key is a single column name
        or a sequence of arrays of values if key is a sequence of names.
        It can also be a generator that yields either of the two. A row
        is created for each value or list of values not found in the rows
        of the table.
    :param default_row:
        An array of values or a function to generate new rows. The length
        of the input array should be equal to row length minus
        column_names count. The length of array generated by the function
        should be the row length.
    :returns:
        A new :class:`.Table`.
    """
    rows = list(self._rows)

    if not utils.issequence(key):
        key = [key]

    # compare_values may be a generator (documented above). Materialize it
    # once: otherwise the any() scan below exhausts it and the set()
    # comparison later sees nothing, silently generating no rows.
    compare_values = list(compare_values)

    if len(key) == 1:
        # Single-column key: wrap scalar compare values so each entry is a
        # one-element sequence matching the multi-key shape.
        if any(not utils.issequence(compare_value) for compare_value in compare_values):
            compare_values = [[compare_value] for compare_value in compare_values]

    column_values = [self._columns.get(name) for name in key]
    column_indexes = [self._column_names.index(name) for name in key]

    column_values = zip(*column_values)
    differences = list(set(map(tuple, compare_values)) - set(column_values))

    for difference in differences:
        if callable(default_row):
            rows.append(Row(default_row(difference), self._column_names))
        else:
            if default_row is not None:
                # Copy the template: aliasing it would mutate the caller's
                # list via insert() below and corrupt every subsequent
                # generated row.
                new_row = list(default_row)
            else:
                new_row = [None] * (len(self._column_names) - len(key))

            # Splice the missing key values into their column positions.
            for i, d in zip(column_indexes, difference):
                new_row.insert(i, d)

            rows.append(Row(new_row, self._column_names))

    return self._fork(rows)
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Create a new table with row values converted into columns.

    For example:

    +---------+-----------+---------+
    | name    | property  | value   |
    +=========+===========+=========+
    | Jane    | gender    | female  |
    +---------+-----------+---------+
    | Jane    | race      | black   |
    +---------+-----------+---------+
    | Jane    | age       | 24      |
    +---------+-----------+---------+
    | ...     | ...       | ...     |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in `property` becomes a
    column with `value` used for its values.

    +---------+----------+--------+-------+
    | name    | gender   | race   | age   |
    +=========+==========+========+=======+
    | Jane    | female   | black  | 24    |
    +---------+----------+--------+-------+
    | Jack    | male     | white  | 35    |
    +---------+----------+--------+-------+
    | Joe     | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the tables unique identifiers and any metadata about them.
        Or, :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new
        table.
    :param value_column:
        The column whose values should become the values of the new
        columns in the new table.
    :param default_value:
        Value to be used for missing values in the pivot table. If not
        specified :code:`Decimal(0)` will be used for aggregations that
        return :class:`.Number` data and :code:`None` will be used for
        all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]

    field_names = []
    row_data = OrderedDict()

    # Group values by key tuple; remember each property name in first-seen
    # order so the output column order is stable.
    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        # Fill every property column, substituting the default where a key
        # had no value for that property.
        for f in field_names:
            if f in v:
                row.append(v[f])
            else:
                row.append(default_value)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [
        self.column_types[self.column_names.index(name)] for name in key
    ]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types

        # Keep key columns at their existing types unless the caller
        # explicitly forced something else.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def __init__(self, rows, column_info, row_names=None, _is_fork=False):
    """
    Build a table from rows and column metadata.

    :param rows:
        A sequence of row sequences.
    :param column_info:
        Either a sequence of :class:`Column` instances (whose names and
        data types are reused) or a sequence of ``(name, data_type)``
        pairs. Falsy names are replaced with generated letter names;
        duplicate names raise :class:`ValueError`.
    :param row_names:
        A column name, a function of a row, a sequence of names, or
        ``None``.
    :param _is_fork:
        Internal flag. When ``True``, ``rows`` are assumed to already be
        cast :class:`Row` instances and are used without re-casting.
    """
    column_info = list(column_info)

    if isinstance(column_info[0], Column):
        # Reuse names and types from existing Column instances.
        self._column_names = tuple(c.name for c in column_info)
        self._column_types = tuple(c.data_type for c in column_info)
    else:
        column_names, self._column_types = zip(*column_info)

        final_names = []

        # Validation
        for i, column_name in enumerate(column_names):
            if not column_name:
                final_names.append(letter_name(i))
            else:
                if not isinstance(column_name, six.string_types):
                    raise ValueError('Column names must be strings.')

                final_names.append(column_name)

        if len(set(final_names)) != len(final_names):
            raise ValueError('Duplicate column names are not allowed.')

        self._column_names = tuple(final_names)

    # Computed after both branches. Previously this was only assigned on
    # the (name, type)-pair branch, so passing Column instances with
    # _is_fork=False raised NameError in the cast loop below.
    len_column_names = len(self._column_names)

    for column_type in self._column_types:
        if not isinstance(column_type, DataType):
            raise ValueError('Column types must be instances of DataType.')

    if not _is_fork:
        new_rows = []

        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError('Row %i has %i values, but Table only has %i columns.'
                                 % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                # Pad short rows with None values.
                row = chain(row, [None] * (len(self.column_names) - len_row))

            # Cast each value with its column's cast function.
            new_rows.append(Row(tuple(cast_funcs[j](d) for j, d in enumerate(row)), self._column_names))
    else:
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's value from each row.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # A function: call it with each row.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif isinstance(row_names, Sequence):
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)