def from_csv(cls, path, column_info=None, row_names=None, header=True, **kwargs):
    """
    Create a new table from a CSV.

    This method will use csvkit if it is available, otherwise it will
    use Python's builtin csv module. ``kwargs`` will be passed through to
    :meth:`csv.reader`.

    If you are using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param path:
        Filepath or file-like object from which to read CSV data.
    :param column_info:
        May be any valid input to :meth:`Table.__init__` or an instance of
        :class:`.TypeTester`. Or, None, in which case a generic
        :class:`.TypeTester` will be created.
    :param row_names:
        See :meth:`Table.__init__`.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped.
    :raises ValueError:
        If an explicit ``column_info`` sequence does not match the number
        of columns found in the CSV.
    """
    if column_info is None:
        column_info = TypeTester()

    # When column_info is a TypeTester, column types are inferred from the
    # data; otherwise column_info is treated as an explicit specification.
    use_inference = isinstance(column_info, TypeTester)

    # Accept either an open file-like object or a filesystem path.
    if hasattr(path, 'read'):
        rows = list(csv.reader(path, **kwargs))
    else:
        with open(path) as f:
            rows = list(csv.reader(f, **kwargs))

    if header:
        column_names = rows.pop(0)
    else:
        column_names = [None] * len(rows[0])

    if use_inference:
        column_info = column_info.run(rows, column_names)
    else:
        if len(column_names) != len(column_info):
            # Report both counts: the mismatch may be in either direction.
            raise ValueError(
                'CSV contains %i columns, but %i were specified.'
                % (len(column_names), len(column_info))
            )

    return Table(rows, column_info, row_names=row_names)
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Denormalize a dataset so that unique values in a column become their
    own columns.

    For example:

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in ``property_column``
    becomes a column with ``value_column`` used for its values.

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have `row_names` set to those keys.

    This is the opposite of :meth:`Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the tables unique identifiers and any metadata about them. Or,
        :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new
        table.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used for missing values in the pivot table. If not
        specified :code:`Decimal(0)` will be used for aggregations that
        return :class:`.Number` data and :code:`None` will be used for
        all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`Table`.
    """
    from agate.table import Table

    # Normalize ``key`` to a list of column names.
    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]

    # field_names preserves first-seen order of the property values;
    # row_data maps each key tuple to an ordered {property: value} dict.
    field_names = []
    row_data = OrderedDict()

    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    # utils.default is a sentinel: only substitute when the caller did not
    # pass an explicit default_value.
    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        # Single-column keys become scalar row names; multi-column keys
        # stay tuples.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        # Fill in a value for every property column, padding gaps with
        # default_value.
        for f in field_names:
            if f in v:
                row.append(v[f])
            else:
                row.append(default_value)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Force the key columns to keep their existing types, while letting
        # any caller-specified forces take precedence.
        # NOTE(review): this mutates a caller-supplied TypeTester's _force
        # in place — confirm that is acceptable to callers.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    Column names may be omitted (letter names are generated), contain
    ``None`` entries (replaced with letter names, with a warning), or
    contain duplicates (renamed with a numeric suffix, with a warning).

    :param rows:
        A sequence of row sequences. Must not be a string.
    :param column_names:
        A sequence of column name strings (entries may be ``None``), or
        ``None`` to generate letter names from the first row.
    :param column_types:
        A sequence of :class:`DataType` instances, a
        :class:`.TypeTester`, or ``None`` for a generic TypeTester.
    :param row_names:
        A column name, a function of a row, or a sequence of names used
        to index rows. Optional.
    :param _is_fork:
        Internal flag: when True, ``rows`` are already-cast Row objects
        and casting/validation of row lengths is skipped.
    :raises ValueError:
        For string ``rows``, non-string/non-None column names, bad column
        types, length mismatches, or an invalid ``row_names`` argument.
    """
    # A common mistake is passing a CSV path here; fail fast with guidance.
    if isinstance(rows, six.string_types):
        raise ValueError(
            "When created directly, the first argument to Table must be a sequence of rows. Did you want agate.Table.from_csv?"
        )

    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                # Missing name: substitute a spreadsheet-style letter name.
                new_column_name = utils.letter_name(i)
                warnings.warn(
                    'Column name not specified. "%s" will be used as name.' % new_column_name,
                    RuntimeWarning
                )
            elif isinstance(column_name, six.string_types):
                new_column_name = column_name
            else:
                raise ValueError("Column names must be strings or None.")

            final_column_name = new_column_name
            duplicates = 0

            # Deduplicate by appending "_2", "_3", ... until unique.
            while final_column_name in final_column_names:
                final_column_name = new_column_name + "_" + str(duplicates + 2)
                duplicates += 1

            if duplicates > 0:
                warn_duplicate_column(new_column_name, final_column_name)

            final_column_names.append(final_column_name)

        self._column_names = tuple(final_column_names)
    elif rows:
        # No names given: generate one letter name per value in the first row.
        self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))
        warnings.warn(
            'Column names not specified. "%s" will be used as names.'
            % str(self._column_names),
            RuntimeWarning,
            stacklevel=2,
        )
    else:
        # No rows and no names: empty table.
        self._column_names = []

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif not isinstance(column_types, TypeTester):
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError("Column types must be instances of DataType.")

    if isinstance(column_types, TypeTester):
        # Infer a concrete type for each column from the data.
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError("column_names and column_types must be the same length.")

    if not _is_fork:
        # Cast every value to its column's type; pad short rows with None.
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    "Row %i has %i values, but Table only has %i columns." % (i, len_row, len_column_names)
                )
            elif len(row) < len_column_names:
                row = chain(row, [None] * (len(self.column_names) - len_row))

            new_rows.append(Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)), self._column_names))
    else:
        # Forked tables reuse already-cast rows unchanged.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's value as each row's name.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, "__call__"):
            # A function: call it with each row to compute its name.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif utils.issequence(row_names):
            # An explicit sequence of names, used as-is.
            computed_row_names = row_names
        else:
            raise ValueError("row_names must be a column name, function or sequence")

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    :param rows:
        A sequence of row sequences.
    :param column_names:
        A sequence of column name strings (entries may be ``None``, in
        which case letter names are substituted), or ``None`` to generate
        letter names from the first row.
    :param column_types:
        A sequence of :class:`DataType` instances, a
        :class:`.TypeTester`, or ``None`` for a generic TypeTester.
    :param row_names:
        A column name, a function of a row, or a sequence of names used
        to index rows. Optional.
    :param _is_fork:
        Internal flag: when True, ``rows`` are already-cast Row objects
        and casting/validation of row lengths is skipped.
    :raises ValueError:
        For invalid column names, duplicate column names, bad column
        types, length mismatches, or an invalid ``row_names`` argument.
    """
    # Validate column names
    if column_names:
        final_column_names = []

        for i, column_name in enumerate(column_names):
            if column_name is None:
                # Missing name: substitute a spreadsheet-style letter name.
                final_column_names.append(utils.letter_name(i))
            elif isinstance(column_name, six.string_types):
                final_column_names.append(column_name)
            else:
                raise ValueError('Column names must be strings or None.')

        if len(set(final_column_names)) != len(final_column_names):
            raise ValueError('Duplicate column names are not allowed.')

        self._column_names = tuple(final_column_names)
    else:
        # No names given: generate one letter name per value in the first
        # row. NOTE(review): raises IndexError if ``rows`` is also empty.
        self._column_names = tuple(
            utils.letter_name(i) for i in range(len(rows[0])))

    len_column_names = len(self._column_names)

    # Validate or infer column types. A single isinstance dispatch
    # replaces the original's redundant two-step check.
    if column_types is None:
        column_types = TypeTester()

    if isinstance(column_types, TypeTester):
        self._column_types = column_types.run(rows, self._column_names)
    else:
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError(
                    'Column types must be instances of DataType.')

        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError(
            'column_names and column_types must be the same length.')

    if not _is_fork:
        # Cast every value to its column's type; pad short rows with None.
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError(
                    'Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len_row < len_column_names:
                row = chain(row, [None] * (len_column_names - len_row))

            new_rows.append(
                Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)),
                    self._column_names))
    else:
        # Forked tables reuse already-cast rows unchanged.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's value as each row's name.
            for row in new_rows:
                computed_row_names.append(row[row_names])
        elif hasattr(row_names, '__call__'):
            # A function: call it with each row to compute its name.
            for row in new_rows:
                computed_row_names.append(row_names(row))
        elif isinstance(row_names, Sequence):
            # An explicit sequence of names, used as-is.
            computed_row_names = row_names
        else:
            raise ValueError(
                'row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(
            zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows,
                        row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def __init__(self, rows, column_names=None, column_types=None, row_names=None, _is_fork=False):
    """
    Construct a table from a sequence of rows.

    :param rows:
        A sequence of row sequences.
    :param column_names:
        A sequence of unique column name strings, or ``None`` to
        generate letter names from the first row.
    :param column_types:
        A sequence of :class:`DataType` instances, a
        :class:`.TypeTester`, or ``None`` for a generic TypeTester.
    :param row_names:
        A column name, a function of a row, or a sequence of names used
        to index rows. Optional.
    :param _is_fork:
        Internal flag: when True, ``rows`` are already-cast Row objects
        and casting/validation of row lengths is skipped.
    :raises ValueError:
        For non-string or duplicate column names, bad column types,
        length mismatches, or an invalid ``row_names`` argument.
    """
    # Validate column names
    if column_names:
        for column_name in column_names:
            if not isinstance(column_name, six.string_types):
                raise ValueError('Column names must be strings.')

        if len(set(column_names)) != len(column_names):
            raise ValueError('Duplicate column names are not allowed.')

        self._column_names = tuple(column_names)
    else:
        # No names given: generate one letter name per value in the first row.
        self._column_names = tuple(utils.letter_name(i) for i in range(len(rows[0])))

    len_column_names = len(self._column_names)

    # Validate column_types
    if column_types is None:
        column_types = TypeTester()
    elif isinstance(column_types, TypeTester):
        pass
    else:
        for column_type in column_types:
            if not isinstance(column_type, DataType):
                raise ValueError('Column types must be instances of DataType.')

    if isinstance(column_types, TypeTester):
        # Infer a concrete type for each column from the data.
        self._column_types = column_types.run(rows, self._column_names)
    else:
        self._column_types = tuple(column_types)

    if len_column_names != len(self._column_types):
        raise ValueError('column_names and column_types must be the same length.')

    if not _is_fork:
        # Cast every value to its column's type; pad short rows with None.
        new_rows = []
        cast_funcs = [c.cast for c in self._column_types]

        for i, row in enumerate(rows):
            len_row = len(row)

            if len_row > len_column_names:
                raise ValueError('Row %i has %i values, but Table only has %i columns.'
                    % (i, len_row, len_column_names))
            elif len(row) < len_column_names:
                row = chain(row, [None] * (len(self.column_names) - len_row))

            new_rows.append(Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)), self._column_names))
    else:
        # Forked tables reuse already-cast rows unchanged.
        new_rows = rows

    if row_names:
        computed_row_names = []

        if isinstance(row_names, six.string_types):
            # A column name: use that column's value as each row's name.
            for row in new_rows:
                name = row[row_names]
                computed_row_names.append(name)
        elif hasattr(row_names, '__call__'):
            # A function: call it with each row to compute its name.
            for row in new_rows:
                name = row_names(row)
                computed_row_names.append(name)
        elif isinstance(row_names, Sequence):
            # An explicit sequence of names, used as-is.
            computed_row_names = row_names
        else:
            raise ValueError('row_names must be a column name, function or sequence')

        self._row_names = tuple(computed_row_names)
    else:
        self._row_names = None

    self._rows = MappedSequence(new_rows, self._row_names)

    # Build columns
    new_columns = []

    for i, (name, data_type) in enumerate(zip(self._column_names, self._column_types)):
        column = Column(i, name, data_type, self._rows, row_names=self._row_names)
        new_columns.append(column)

    self._columns = MappedSequence(new_columns, self._column_names)
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Normalize a sequence of columns into two columns for field and value.

    For example:

    +---------+----------+--------+-------+
    |  name   | gender   | race   | age   |
    +=========+==========+========+=======+
    |  Jane   | female   | black  | 24    |
    +---------+----------+--------+-------+
    |  Jack   | male     | white  | 35    |
    +---------+----------+--------+-------+
    |  Joe    | male     | black  | 28    |
    +---------+----------+--------+-------+

    can be normalized on columns 'gender', 'race' and 'age':

    +---------+-----------+---------+
    |  name   | property  | value   |
    +=========+===========+=========+
    |  Jane   | gender    | female  |
    +---------+-----------+---------+
    |  Jane   | race      | black   |
    +---------+-----------+---------+
    |  Jane   | age       | 24      |
    +---------+-----------+---------+
    |  ...    |  ...      |  ...    |
    +---------+-----------+---------+

    This is the opposite of :meth:`Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the tables unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column
        in that order or an instance of :class:`.TypeTester`. Defaults to
        a generic :class:`.TypeTester`.
    :returns:
        A new :class:`Table`.
    """
    from agate.table import Table

    new_rows = []

    # Normalize ``key`` and ``properties`` to lists of column names.
    if not utils.issequence(key):
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    row_names = []

    # Emit one output row per (input row, property) pair.
    for row in self.rows:
        k = tuple(row[n] for n in key)
        left_row = list(k)

        # Single-column keys become scalar row names; multi-column keys
        # stay tuples. Note: names repeat once per property.
        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in properties:
            new_rows.append(Row(tuple(left_row + [f, row[f]]), new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        # Force the key columns to keep their existing types, while letting
        # any caller-specified forces take precedence.
        # NOTE(review): this mutates a caller-supplied TypeTester's _force
        # in place — confirm that is acceptable to callers.
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)