def from_json(cls, path, keys=None, **kwargs):
    """
    Create a new :class:`TableSet` from a directory of JSON files or a
    single JSON object with key/value pairs, where each key is a
    :class:`Table` name and each value is a list of row objects.

    See :meth:`.Table.from_json` for additional details.

    :param path:
        Path to a directory containing JSON files or filepath/file-like
        object of nested JSON file.
    :param keys:
        A list of keys of the top-level dictionaries for each file. If
        specified, length must be equal to number of JSON files in path.
    """
    if isinstance(path, six.string_types) and not os.path.isdir(path) and not os.path.isfile(path):
        raise IOError('Specified path doesn\'t exist.')

    tables = OrderedDict()

    if isinstance(path, six.string_types) and os.path.isdir(path):
        filepaths = glob(os.path.join(path, '*.json'))

        if keys is not None and len(keys) != len(filepaths):
            raise ValueError('If specified, keys must have length equal to number of JSON files')

        for i, filepath in enumerate(filepaths):
            # splitext, not strip('.json'): strip() removes any of those
            # characters from both ends of the filename.
            name = os.path.splitext(os.path.basename(filepath))[0]

            if keys is not None:
                tables[name] = Table.from_json(filepath, keys[i], **kwargs)
            else:
                tables[name] = Table.from_json(filepath, **kwargs)
    else:
        if hasattr(path, 'read'):
            js = json.load(path, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)
        else:
            with open(path, 'r') as f:
                js = json.load(f, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)

        for key, value in js.items():
            output = StringIO(json.dumps(value))
            tables[key] = Table.from_json(output)

    return TableSet(tables.values(), tables.keys())
def test_monkeypatch_shadow(self):
    before_table = Table([['blah'], ], ['foo'], [Text()])

    Table.monkeypatch(TryPatchShadow)

    after_table = Table([['blah'], ], ['foo'], [Text()])

    self.assertIsInstance(before_table.columns, MappedSequence)
    self.assertIsInstance(after_table.columns, MappedSequence)

    with self.assertRaises(AttributeError):
        after_table.foo == 'foo'
def test_monkeypatch_shadow(self):
    before_table = Table([['blah'], ], [('foo', Text())])

    Table.monkeypatch(TryPatchShadow)

    after_table = Table([['blah'], ], [('foo', Text())])

    self.assertIsInstance(before_table.columns, MappedSequence)
    self.assertIsInstance(after_table.columns, MappedSequence)

    with self.assertRaises(AttributeError):
        after_table.foo == 'foo'
def from_json(cls, path, column_names=None, column_types=None, keys=None, **kwargs):
    """
    Create a new :class:`TableSet` from a directory of JSON files or a
    single JSON object with key/value pairs, where each key is a
    :class:`Table` name and each value is a list of row objects.

    See :meth:`.Table.from_json` for additional details.

    :param path:
        Path to a directory containing JSON files or filepath/file-like
        object of nested JSON file.
    :param keys:
        A list of keys of the top-level dictionaries for each file. If
        specified, length must be equal to number of JSON files in path.
    :param column_types:
        See :meth:`Table.__init__`.
    """
    from agate.tableset import TableSet

    if isinstance(path, six.string_types) and not os.path.isdir(path) and not os.path.isfile(path):
        raise IOError('Specified path doesn\'t exist.')

    tables = OrderedDict()

    if isinstance(path, six.string_types) and os.path.isdir(path):
        filepaths = glob(os.path.join(path, '*.json'))

        if keys is not None and len(keys) != len(filepaths):
            raise ValueError('If specified, keys must have length equal to number of JSON files')

        for i, filepath in enumerate(filepaths):
            # splitext, not strip('.json'): strip() removes any of those
            # characters from both ends of the filename.
            name = os.path.splitext(os.path.basename(filepath))[0]

            if keys is not None:
                tables[name] = Table.from_json(filepath, keys[i], column_types=column_types, **kwargs)
            else:
                tables[name] = Table.from_json(filepath, column_types=column_types, **kwargs)
    else:
        if hasattr(path, 'read'):
            js = json.load(path, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)
        else:
            with open(path, 'r') as f:
                js = json.load(f, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)

        for key, value in js.items():
            tables[key] = Table.from_object(value, column_types=column_types, **kwargs)

    return TableSet(tables.values(), tables.keys())
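# A minimal usage sketch for the directory form of TableSet.from_json above.
# The 'data/' directory and its contents are hypothetical, and this assumes
# the classmethod is exposed as agate.TableSet.from_json.
import agate

tableset = agate.TableSet.from_json('data/')  # one Table per *.json file

for key, table in tableset.items():
    print(key, len(table.rows))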
def from_csv(cls, dir_path, column_names=None, column_types=None, row_names=None, header=True, **kwargs):
    """
    Create a new :class:`TableSet` from a directory of CSVs.

    See :meth:`.Table.from_csv` for additional details.

    :param dir_path:
        Path to a directory full of CSV files. All CSV files in this
        directory will be loaded.
    :param column_names:
        See :meth:`Table.__init__`.
    :param column_types:
        See :meth:`Table.__init__`.
    :param row_names:
        See :meth:`Table.__init__`.
    :param header:
        See :meth:`Table.from_csv`.
    """
    if not os.path.isdir(dir_path):
        raise IOError('Specified path doesn\'t exist or isn\'t a directory.')

    tables = OrderedDict()

    for path in glob(os.path.join(dir_path, '*.csv')):
        # splitext, not strip('.csv'): strip() removes any of those
        # characters from both ends of the filename.
        name = os.path.splitext(os.path.basename(path))[0]

        tables[name] = Table.from_csv(path, column_names, column_types, row_names=row_names, header=header, **kwargs)

    return TableSet(tables.values(), tables.keys())
def from_csv(cls, dir_path, column_info, header=True, **kwargs):
    """
    Create a new :class:`TableSet` from a directory of CSVs. This method
    will use csvkit if it is available, otherwise it will use Python's
    builtin csv module.

    ``kwargs`` will be passed through to :meth:`csv.reader`.

    If you are using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param dir_path:
        Path to a directory full of CSV files. All CSV files in this
        directory will be loaded.
    :param column_info:
        See :class:`.Table` constructor.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped.
    """
    from agate.table import Table

    if not os.path.isdir(dir_path):
        raise IOError('Specified path doesn\'t exist or isn\'t a directory.')

    tables = OrderedDict()

    for path in glob(os.path.join(dir_path, '*.csv')):
        # splitext, not strip('.csv'): strip() removes any of those
        # characters from both ends of the filename.
        name = os.path.splitext(os.path.basename(path))[0]

        table = Table.from_csv(path, column_info, header=header, **kwargs)
        tables[name] = table

    return TableSet(tables)
def aggregate(self, aggregations):
    """
    Aggregate data from the tables in this set by performing some set of
    column operations on the groups and coalescing the results into a new
    :class:`.Table`.

    :code:`aggregations` must be a sequence of tuples, where each has two
    parts: a :code:`new_column_name` and a :class:`.Aggregation` instance.

    The resulting table will have the keys from this :class:`TableSet`
    (and any nested TableSets) set as its :code:`row_names`. See
    :meth:`.Table.__init__` for more details.

    :param aggregations:
        A list of tuples in the format :code:`(new_column_name, aggregation)`.
    :returns:
        A new :class:`.Table`.
    """
    column_names, column_types, output, row_name_columns = _aggregate(self, aggregations)

    if len(row_name_columns) == 1:
        row_names = row_name_columns[0]
    else:
        def row_names(r):
            return tuple(r[n] for n in row_name_columns)

    return Table(output, column_names, column_types, row_names=row_names)
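# Sketch of the aggregate() workflow: group a table, then collapse each group
# to one row. The 'employees' table and its columns are hypothetical.
import agate

employees = agate.Table([
    ['sales', 50000],
    ['sales', 60000],
    ['ops', 55000],
], ['department', 'salary'], [agate.Text(), agate.Number()])

summary = employees.group_by('department').aggregate([
    ('count', agate.Count()),
    ('median_salary', agate.Median('salary')),
])
# summary has row_names set to the group keys: 'sales', 'ops'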
def from_csv(cls, dir_path, column_info, row_names=None, header=True, **kwargs):
    """
    Create a new :class:`TableSet` from a directory of CSVs. This method
    will use csvkit if it is available, otherwise it will use Python's
    builtin csv module.

    ``kwargs`` will be passed through to :meth:`csv.reader`.

    If you are using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param dir_path:
        Path to a directory full of CSV files. All CSV files in this
        directory will be loaded.
    :param column_info:
        A sequence of pairs of column names and types. The latter must be
        instances of :class:`.DataType`. Or, an instance of
        :class:`.TypeTester` to infer types.
    :param row_names:
        See :meth:`Table.__init__`.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped.
    """
    use_inference = isinstance(column_info, TypeTester)

    if use_inference and not header:
        raise ValueError('Cannot apply TypeTester to a CSV without headers.')

    if not os.path.isdir(dir_path):
        raise IOError('Specified path doesn\'t exist or isn\'t a directory.')

    tables = OrderedDict()

    if use_inference:
        has_inferred_columns = False

    for path in glob(os.path.join(dir_path, '*.csv')):
        # splitext, not strip('.csv'): strip() removes any of those
        # characters from both ends of the filename.
        name = os.path.splitext(os.path.basename(path))[0]

        table = Table.from_csv(path, column_info, row_names=row_names, header=header, **kwargs)

        # Reuse the types inferred for the first CSV for all subsequent ones.
        if use_inference and not has_inferred_columns:
            column_info = tuple(zip(table.column_names, table.column_types))
            has_inferred_columns = True

        tables[name] = table

    return TableSet(tables.values(), tables.keys())
def merge(cls, tables, row_names=None, column_names=None):
    """
    Create a new table from a sequence of similar tables.

    This method will not carry over row names from the merged tables, but
    new row names can be specified with the :code:`row_names` argument.

    It is possible to limit the columns included in the new :class:`.Table`
    with the :code:`column_names` argument. For example, to only include
    columns from a specific table, set :code:`column_names` equal to
    :code:`table.column_names`.

    :param tables:
        A sequence of :class:`.Table` instances.
    :param row_names:
        See :class:`.Table` for the usage of this parameter.
    :param column_names:
        A sequence of column names to include in the new :class:`.Table`.
        If not specified, all distinct column names from `tables` are
        included.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    new_columns = OrderedDict()

    for table in tables:
        for i in range(0, len(table.columns)):
            if column_names is None or table.column_names[i] in column_names:
                column_name = table.column_names[i]
                column_type = table.column_types[i]

                if column_name in new_columns:
                    if not isinstance(column_type, type(new_columns[column_name])):
                        raise DataTypeError('Tables contain columns with the same names, but different types.')
                else:
                    new_columns[column_name] = column_type

    # Materialize as tuples so the comparison below can succeed on Python 3,
    # where dict views never compare equal to tuples.
    column_keys = tuple(new_columns.keys())
    column_types = tuple(new_columns.values())

    rows = []

    for table in tables:
        # Performance optimization for identical table structures
        if table.column_names == column_keys and table.column_types == column_types:
            rows.extend(table.rows)
        else:
            for row in table.rows:
                data = []

                for column_key in column_keys:
                    data.append(row.get(column_key, None))

                rows.append(Row(data, column_keys))

    return Table(rows, column_keys, column_types, row_names=row_names, _is_fork=True)
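# Sketch: merge() takes the union of column names; cells missing from a
# source table come back as None. Data is hypothetical.
import agate

t1 = agate.Table([[1, 'a']], ['id', 'letter'], [agate.Number(), agate.Text()])
t2 = agate.Table([[2, True]], ['id', 'flag'], [agate.Number(), agate.Boolean()])

merged = agate.Table.merge([t1, t2])
# merged.column_names -> ('id', 'letter', 'flag')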
def test_table_from_csv(self):
    import csvkit
    from agate import table
    table.csv = csvkit

    if six.PY2:
        table = Table.from_csv('examples/test.csv', self.tester, encoding='utf8')
    else:
        table = Table.from_csv('examples/test.csv', self.tester)

    self.assertSequenceEqual(table.get_column_names(), ['one', 'two', 'three'])
    self.assertSequenceEqual(tuple(map(type, table.get_column_types())), [Number, Number, Text])

    self.assertEqual(len(table.columns), 3)

    self.assertSequenceEqual(table.rows[0], [1, 4, 'a'])
    self.assertSequenceEqual(table.rows[1], [2, 3, 'b'])
    self.assertSequenceEqual(table.rows[2], [None, 2, u'👍'])
def print_structure(self, output=sys.stdout, max_rows=None):
    """
    Print this table's column names and types as a plain-text table.

    :param output:
        The output to print to.
    :param max_rows:
        The maximum number of rows to display before truncating the data.
    """
    from agate.table import Table

    name_column = [n for n in self._column_names]
    type_column = [t.__class__.__name__ for t in self._column_types]
    rows = zip(name_column, type_column)
    column_names = ['column', 'data_type']
    text = Text()
    column_types = [text, text]

    table = Table(rows, column_names, column_types)

    return table.print_table(output=output, max_column_width=None, max_rows=max_rows)
def print_structure(self, output=sys.stdout):
    """
    Print this table's column names and types as a plain-text table.

    :param output:
        The output to print to.
    """
    from agate.table import Table

    name_column = [n for n in self.column_names]
    type_column = [t.__class__.__name__ for t in self.column_types]
    rows = zip(name_column, type_column)
    column_names = ['column', 'data_type']
    text = Text()
    column_types = [text, text]

    table = Table(rows, column_names, column_types)

    return table.print_table(output=output, max_column_width=None)
def test_table_from_csv(self):
    table = Table.from_csv('examples/test.csv', self.tester)

    self.assertSequenceEqual(table.column_names, ['one', 'two', 'three'])
    self.assertSequenceEqual(tuple(map(type, table.column_types)), [Number, Number, Text])

    self.assertEqual(len(table.columns), 3)

    self.assertSequenceEqual(table.rows[0], [1, 4, 'a'])
    self.assertSequenceEqual(table.rows[1], [2, 3, 'b'])
    self.assertSequenceEqual(table.rows[2], [None, 2, u'👍'])
def test_table_from_csv(self):
    table = Table.from_csv('examples/test.csv', column_types=self.tester)

    self.assertSequenceEqual(table.column_names, ['one', 'two', 'three'])
    self.assertSequenceEqual(tuple(map(type, table.column_types)), [Number, Number, Text])

    self.assertEqual(len(table.columns), 3)

    self.assertSequenceEqual(table.rows[0], [1, 4, 'a'])
    self.assertSequenceEqual(table.rows[1], [2, 3, 'b'])
    self.assertSequenceEqual(table.rows[2], [None, 2, u'👍'])
def from_json(cls, path, row_names=None, key=None, newline=False, column_types=None, **kwargs):
    """
    Create a new table from a JSON file.

    Once the JSON has been deserialized, the resulting Python object is
    passed to :meth:`.Table.from_object`.

    If the file contains a top-level dictionary you may specify what
    property contains the row list using the :code:`key` parameter.

    :code:`kwargs` will be passed through to :meth:`json.load`.

    :param path:
        Filepath or file-like object from which to read JSON data.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param key:
        The key of the top-level dictionary that contains a list of row
        arrays.
    :param newline:
        If `True` then the file will be parsed as "newline-delimited JSON".
    :param column_types:
        See :meth:`.Table.__init__`.
    """
    from agate.table import Table

    if key is not None and newline:
        raise ValueError('key and newline may not be specified together.')

    if newline:
        js = []

        if hasattr(path, 'read'):
            for line in path:
                js.append(json.loads(line, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs))
        else:
            with open(path, 'r') as f:
                for line in f:
                    js.append(json.loads(line, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs))
    else:
        if hasattr(path, 'read'):
            js = json.load(path, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)
        else:
            with open(path, 'r') as f:
                js = json.load(f, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)

    if isinstance(js, dict):
        if not key:
            raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

        js = js[key]

    return Table.from_object(js, row_names=row_names, column_types=column_types)
def test_table_from_csv(self):
    import csvkit
    from agate import table
    table.csv = csvkit

    if six.PY2:
        table = Table.from_csv('examples/test.csv', self.tester, encoding='utf8')
    else:
        table = Table.from_csv('examples/test.csv', self.tester)

    self.assertSequenceEqual(table.column_names, ['one', 'two', 'three'])
    self.assertSequenceEqual(tuple(map(type, table.column_types)), [Number, Number, Text])

    self.assertEqual(len(table.columns), 3)

    self.assertSequenceEqual(table.rows[0], [1, 4, 'a'])
    self.assertSequenceEqual(table.rows[1], [2, 3, 'b'])
    self.assertSequenceEqual(table.rows[2], [None, 2, u'👍'])
def rename(self, column_names=None, row_names=None, slug_columns=False, slug_rows=False, **kwargs):
    """
    Create a copy of this table with different column names or row names.

    By enabling :code:`slug_columns` or :code:`slug_rows` and not specifying
    new names you may slugify the table's existing names.

    :code:`kwargs` will be passed through to `awesome-slugify's
    <https://github.com/dimka665/awesome-slugify>`_ :code:`UniqueSlugify`
    class.

    :param column_names:
        New column names for the renamed table. May be either an array or
        a dictionary mapping existing column names to new names. If not
        specified, will use this table's existing column names.
    :param row_names:
        New row names for the renamed table. May be either an array or
        a dictionary mapping existing row names to new names. If not
        specified, will use this table's existing row names.
    :param slug_columns:
        If True, column names will be converted to slugs and duplicate
        names will have unique identifiers appended.
    :param slug_rows:
        If True, row names will be converted to slugs and duplicate names
        will have unique identifiers appended.
    """
    from agate.table import Table

    if isinstance(column_names, dict):
        column_names = [column_names[name] if name in column_names else name for name in self._column_names]

    if isinstance(row_names, dict):
        row_names = [row_names[name] if name in row_names else name for name in self._row_names]

    if slug_columns:
        column_names = column_names or self._column_names

        if column_names is not None:
            column_names = utils.slugify(column_names, ensure_unique=True, **kwargs)

    if slug_rows:
        row_names = row_names or self.row_names

        if row_names is not None:
            row_names = utils.slugify(row_names, ensure_unique=True, **kwargs)

    if column_names is not None and column_names != self._column_names:
        if row_names is None:
            row_names = self._row_names

        return Table(self._rows, column_names, self._column_types, row_names=row_names, _is_fork=False)
    else:
        return self._fork(self._rows, column_names, self._column_types, row_names=row_names)
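# Sketch of rename(): mapping names with a dict versus slugifying in place.
# Data is hypothetical; slug_columns requires the awesome-slugify package.
import agate

table = agate.Table([[1, 2]], ['one', 'Two Words!'], [agate.Number(), agate.Number()])

renamed = table.rename(column_names={'one': 'first'})  # unmapped names are kept
slugged = table.rename(slug_columns=True)              # 'Two Words!' -> 'two_words'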
def test_monkeypatch(self):
    before_table = Table([], ['foo'], [Text()])

    Table.monkeypatch(TryPatch)

    after_table = Table([], ['foo'], [Text()])

    self.assertSequenceEqual(Table.__bases__, [Patchable, TryPatch])

    self.assertIsNotNone(getattr(before_table, 'test'))
    self.assertIsNotNone(getattr(before_table, 'testcls'))
    self.assertIsNotNone(getattr(after_table, 'test'))
    self.assertIsNotNone(getattr(after_table, 'testcls'))

    self.assertEqual(before_table.test(5), 5)
    self.assertEqual(after_table.test(5), 5)
    self.assertEqual(Table.testcls(5), 5)
def print_structure(self, max_rows=20, output=sys.stdout):
    """
    Print the keys and row counts of each table in the tableset.

    :param max_rows:
        The maximum number of rows to display before truncating the data.
        Defaults to 20.
    :param output:
        The output used to print the structure of the :class:`Table`.
    :returns:
        None
    """
    max_length = min(len(self.items()), max_rows)

    name_column = self.keys()[0:max_length]
    type_column = [str(len(table.rows)) for key, table in self.items()[0:max_length]]
    rows = zip(name_column, type_column)
    column_names = ['table', 'rows']
    text = Text()
    column_types = [text, text]

    table = Table(rows, column_names, column_types)

    return table.print_table(output=output, max_column_width=None)
def from_csv(cls, dir_path, column_info, header=True, **kwargs):
    """
    Create a new :class:`TableSet` from a directory of CSVs. This method
    will use csvkit if it is available, otherwise it will use Python's
    builtin csv module.

    ``kwargs`` will be passed through to :meth:`csv.reader`.

    If you are using Python 2 and not using csvkit, this method is not
    unicode-safe.

    :param dir_path:
        Path to a directory full of CSV files. All CSV files in this
        directory will be loaded.
    :param column_info:
        A sequence of pairs of column names and types. The latter must be
        instances of :class:`.DataType`. Or, an instance of
        :class:`.TypeTester` to infer types.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped.
    """
    from agate.table import Table

    use_inference = isinstance(column_info, TypeTester)

    if use_inference and not header:
        raise ValueError('Cannot apply TypeTester to a CSV without headers.')

    if not os.path.isdir(dir_path):
        raise IOError('Specified path doesn\'t exist or isn\'t a directory.')

    tables = OrderedDict()

    if use_inference:
        has_inferred_columns = False

    for path in glob(os.path.join(dir_path, '*.csv')):
        # splitext, not strip('.csv'): strip() removes any of those
        # characters from both ends of the filename.
        name = os.path.splitext(os.path.basename(path))[0]

        table = Table.from_csv(path, column_info, header=header, **kwargs)

        # Reuse the types inferred for the first CSV for all subsequent ones.
        if use_inference and not has_inferred_columns:
            column_info = tuple(zip(table.get_column_names(), table.get_column_types()))
            has_inferred_columns = True

        tables[name] = table

    return TableSet(tables)
def merge(self, groups=None, group_name=None, group_type=None):
    """
    Convert this TableSet into a single table. This is the inverse of
    :meth:`.Table.group_by`.

    Any `row_names` set on the merged tables will be lost in this process.

    :param groups:
        A list of grouping factors to add to merged rows in a new column.
        If specified, it should have exactly one element per :class:`Table`
        in the :class:`TableSet`. If not specified or None, the grouping
        factor will be the name of the :class:`Row`'s original Table.
    :param group_name:
        This will be the column name of the grouping factors. If None,
        defaults to the :attr:`TableSet.key_name`.
    :param group_type:
        This will be the column type of the grouping factors. If None,
        defaults to the :attr:`TableSet.key_type`.
    :returns:
        A new :class:`Table`.
    """
    if groups is not None and type(groups) is not list:
        raise ValueError('Groups must be None or a list.')

    if type(groups) is list and len(groups) != len(self):
        raise ValueError('Groups length must be equal to TableSet length.')

    column_names = list(self.column_names)
    column_types = list(self.column_types)

    column_names.insert(0, group_name if group_name else self.key_name)
    column_types.insert(0, group_type if group_type else self.key_type)

    rows = []

    for index, (key, table) in enumerate(self.items()):
        for row in table.rows:
            if groups is None:
                rows.append(Row((key,) + tuple(row), column_names))
            else:
                rows.append(Row((groups[index],) + tuple(row), column_names))

    return Table(rows, column_names, column_types)
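# Sketch: merge() undoes group_by(); the TableSet key becomes the first
# column of the merged table. The 'employees' table is hypothetical.
import agate

employees = agate.Table([
    ['sales', 50000],
    ['ops', 55000],
], ['department', 'salary'], [agate.Text(), agate.Number()])

flat = employees.group_by('department').merge()
# the first column of `flat` is 'department' (the TableSet's key_name)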
def from_csv(cls, dir_path, column_info, row_names=None, header=True, **kwargs):
    """
    Create a new :class:`TableSet` from a directory of CSVs.

    See :meth:`.Table.from_csv` for additional details.

    :param dir_path:
        Path to a directory full of CSV files. All CSV files in this
        directory will be loaded.
    :param column_info:
        A sequence of pairs of column names and types. The latter must be
        instances of :class:`.DataType`. Or, an instance of
        :class:`.TypeTester` to infer types.
    :param row_names:
        See :meth:`Table.__init__`.
    :param header:
        If `True`, the first row of the CSV is assumed to contain headers
        and will be skipped.
    """
    use_inference = isinstance(column_info, TypeTester)

    if use_inference and not header:
        raise ValueError('Cannot apply TypeTester to a CSV without headers.')

    if not os.path.isdir(dir_path):
        raise IOError('Specified path doesn\'t exist or isn\'t a directory.')

    tables = OrderedDict()

    if use_inference:
        has_inferred_columns = False

    for path in glob(os.path.join(dir_path, '*.csv')):
        # splitext, not strip('.csv'): strip() removes any of those
        # characters from both ends of the filename.
        name = os.path.splitext(os.path.basename(path))[0]

        table = Table.from_csv(path, column_info, row_names=row_names, header=header, **kwargs)

        # Reuse the types inferred for the first CSV for all subsequent ones.
        if use_inference and not has_inferred_columns:
            column_info = tuple(zip(table.column_names, table.column_types))
            has_inferred_columns = True

        tables[name] = table

    return TableSet(tables.values(), tables.keys())
def test_monkeypatch(self):
    before_table = Table([], [('foo', Text())])

    Table.monkeypatch(TryPatch)

    after_table = Table([], [('foo', Text())])

    self.assertSequenceEqual(Table.__bases__, [Patchable, TryPatch])

    self.assertIsNotNone(getattr(before_table, 'test'))
    self.assertIsNotNone(getattr(before_table, 'testcls'))
    self.assertIsNotNone(getattr(after_table, 'test'))
    self.assertIsNotNone(getattr(after_table, 'testcls'))

    self.assertEqual(before_table.test(5), 5)
    self.assertEqual(after_table.test(5), 5)
    self.assertEqual(Table.testcls(5), 5)
def merge(self):
    """
    Convert this TableSet into a single table. This is the inverse of
    :meth:`.Table.group_by`.

    Any :code:`row_names` set on the merged tables will be lost in this
    process.

    :returns:
        A new :class:`Table`.
    """
    column_names = list(self.column_names)
    column_types = list(self.column_types)

    column_names.insert(0, self.key_name)
    column_types.insert(0, self.key_type)

    rows = []

    for key, table in self.items():
        for row in table.rows:
            rows.append(Row((key,) + tuple(row), column_names))

    return Table(rows, column_names, column_types)
def from_object(cls, obj, row_names=None, column_types=None):
    """
    Create a new table from a Python object.

    The object should be a list containing a dictionary for each "row".
    Nested objects or lists will also be parsed. For example, this object:

    .. code-block:: python

        {
            'one': {
                'a': 1,
                'b': 2,
                'c': 3
            },
            'two': [4, 5, 6],
            'three': 'd'
        }

    Would generate these columns and values:

    .. code-block:: python

        {
            'one/a': 1,
            'one/b': 2,
            'one/c': 3,
            'two.0': 4,
            'two.1': 5,
            'two.2': 6,
            'three': 'd'
        }

    Column names and types will be inferred from the data.

    Not all rows are required to have the same keys. Missing elements will
    be filled in with null values.

    :param obj:
        A list of dictionaries, one per row, possibly containing nested
        objects or lists.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    """
    from agate.table import Table

    column_names = []
    row_objects = []

    for sub in obj:
        parsed = utils.parse_object(sub)

        for key in parsed.keys():
            if key not in column_names:
                column_names.append(key)

        row_objects.append(parsed)

    rows = []

    for sub in row_objects:
        r = []

        for name in column_names:
            r.append(sub.get(name, None))

        rows.append(r)

    return Table(rows, column_names, row_names=row_names, column_types=column_types)
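# Sketch mirroring the from_object() docstring: nested dicts flatten with '/'
# separators and lists with '.' separators; missing keys become None.
import agate

rows = [
    {'one': {'a': 1}, 'two': [4, 5], 'three': 'd'},
    {'three': 'e'},
]

table = agate.Table.from_object(rows)
# table.column_names -> ('one/a', 'two.0', 'two.1', 'three')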
def test_monkeypatch_deprecated(self):
    with warnings.catch_warnings():
        warnings.simplefilter('error')

        with self.assertRaises(DeprecationWarning):
            Table.monkeypatch(TryPatch)
def test_monkeypatch_double(self):
    Table.monkeypatch(TryPatch)
    Table.monkeypatch(TryPatch)
    Table.monkeypatch(TryPatch)

    self.assertSequenceEqual(Table.__bases__, [Patchable, TryPatch])
def denormalize(self, key=None, property_column='property', value_column='value', default_value=utils.default, column_types=None):
    """
    Create a new table with row values converted into columns.

    For example:

    +---------+-----------+---------+
    | name    | property  | value   |
    +=========+===========+=========+
    | Jane    | gender    | female  |
    +---------+-----------+---------+
    | Jane    | race      | black   |
    +---------+-----------+---------+
    | Jane    | age       | 24      |
    +---------+-----------+---------+
    | ...     | ...       | ...     |
    +---------+-----------+---------+

    Can be denormalized so that each unique value in :code:`property_column`
    becomes a column with :code:`value_column` used for its values.

    +---------+----------+--------+-------+
    | name    | gender   | race   | age   |
    +=========+==========+========+=======+
    | Jane    | female   | black  | 24    |
    +---------+----------+--------+-------+
    | Jack    | male     | white  | 35    |
    +---------+----------+--------+-------+
    | Joe     | male     | black  | 28    |
    +---------+----------+--------+-------+

    If one or more keys are specified then the resulting table will
    automatically have :code:`row_names` set to those keys.

    This is the opposite of :meth:`.Table.normalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them.
        Or, :code:`None` if there are no key columns.
    :param property_column:
        The column whose values should become column names in the new
        table.
    :param value_column:
        The column whose values should become the values of the property
        columns in the new table.
    :param default_value:
        Value to be used for missing values in the pivot table. If not
        specified, :code:`Decimal(0)` will be used when the value column
        contains :class:`.Number` data and :code:`None` will be used for
        all others.
    :param column_types:
        A sequence of column types with length equal to number of unique
        values in property_column or an instance of :class:`.TypeTester`.
        Defaults to a generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    if key is None:
        key = []
    elif not utils.issequence(key):
        key = [key]

    field_names = []
    row_data = OrderedDict()

    for row in self.rows:
        row_key = tuple(row[k] for k in key)

        if row_key not in row_data:
            row_data[row_key] = OrderedDict()

        f = six.text_type(row[property_column])
        v = row[value_column]

        if f not in field_names:
            field_names.append(f)

        row_data[row_key][f] = v

    if default_value == utils.default:
        if isinstance(self.columns[value_column].data_type, Number):
            default_value = Decimal(0)
        else:
            default_value = None

    new_column_names = key + field_names

    new_rows = []
    row_names = []

    for k, v in row_data.items():
        row = list(k)

        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in field_names:
            if f in v:
                row.append(v[f])
            else:
                row.append(default_value)

        new_rows.append(Row(row, new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
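# Sketch: denormalizing the property/value layout from the docstring above.
# The input table is hypothetical; the default property_column and
# value_column names ('property', 'value') match the example.
import agate

narrow = agate.Table([
    ['Jane', 'gender', 'female'],
    ['Jane', 'race', 'black'],
    ['Jane', 'age', '24'],
], ['name', 'property', 'value'], [agate.Text(), agate.Text(), agate.Text()])

wide = narrow.denormalize('name')
# wide.column_names -> ('name', 'gender', 'race', 'age')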
def normalize(self, key, properties, property_column='property', value_column='value', column_types=None):
    """
    Create a new table with columns converted into rows of property/value
    pairs.

    For example:

    +---------+----------+--------+-------+
    | name    | gender   | race   | age   |
    +=========+==========+========+=======+
    | Jane    | female   | black  | 24    |
    +---------+----------+--------+-------+
    | Jack    | male     | white  | 35    |
    +---------+----------+--------+-------+
    | Joe     | male     | black  | 28    |
    +---------+----------+--------+-------+

    can be normalized on columns 'gender', 'race' and 'age':

    +---------+-----------+---------+
    | name    | property  | value   |
    +=========+===========+=========+
    | Jane    | gender    | female  |
    +---------+-----------+---------+
    | Jane    | race      | black   |
    +---------+-----------+---------+
    | Jane    | age       | 24      |
    +---------+-----------+---------+
    | ...     | ...       | ...     |
    +---------+-----------+---------+

    This is the opposite of :meth:`.Table.denormalize`.

    :param key:
        A column name or a sequence of column names that should be
        maintained as they are in the normalized table. Typically these
        are the table's unique identifiers and any metadata about them.
    :param properties:
        A column name or a sequence of column names that should be
        converted to properties in the new table.
    :param property_column:
        The name to use for the column containing the property names.
    :param value_column:
        The name to use for the column containing the property values.
    :param column_types:
        A sequence of two column types for the property and value column in
        that order or an instance of :class:`.TypeTester`. Defaults to a
        generic :class:`.TypeTester`.
    :returns:
        A new :class:`.Table`.
    """
    from agate.table import Table

    new_rows = []

    if not utils.issequence(key):
        key = [key]

    if not utils.issequence(properties):
        properties = [properties]

    new_column_names = key + [property_column, value_column]

    row_names = []

    for row in self.rows:
        k = tuple(row[n] for n in key)
        left_row = list(k)

        if len(k) == 1:
            row_names.append(k[0])
        else:
            row_names.append(k)

        for f in properties:
            new_rows.append(Row(tuple(left_row + [f, row[f]]), new_column_names))

    key_column_types = [self.column_types[self.column_names.index(name)] for name in key]

    if column_types is None or isinstance(column_types, TypeTester):
        tester = TypeTester() if column_types is None else column_types
        force_update = dict(zip(key, key_column_types))
        force_update.update(tester._force)
        tester._force = force_update

        new_column_types = tester.run(new_rows, new_column_names)
    else:
        new_column_types = key_column_types + list(column_types)

    return Table(new_rows, new_column_names, new_column_types, row_names=row_names)
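# Sketch: normalize() melts the chosen columns into property/value rows,
# the inverse of the denormalize() sketch above. Data is hypothetical.
import agate

wide = agate.Table(
    [['Jane', 'female', 'black', 24]],
    ['name', 'gender', 'race', 'age'],
    [agate.Text(), agate.Text(), agate.Text(), agate.Number()])

narrow = wide.normalize('name', ['gender', 'race', 'age'])
# narrow.column_names -> ('name', 'property', 'value')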
def from_json(cls, path, row_names=None, key=None, newline=False, column_types=None, encoding='utf-8', **kwargs):
    """
    Create a new table from a JSON file.

    Once the JSON has been deserialized, the resulting Python object is
    passed to :meth:`.Table.from_object`.

    If the file contains a top-level dictionary you may specify what
    property contains the row list using the :code:`key` parameter.

    :code:`kwargs` will be passed through to :meth:`json.load`.

    :param path:
        Filepath or file-like object from which to read JSON data.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param key:
        The key of the top-level dictionary that contains a list of row
        arrays.
    :param newline:
        If `True` then the file will be parsed as "newline-delimited JSON".
    :param column_types:
        See :meth:`.Table.__init__`.
    :param encoding:
        According to RFC4627, JSON text shall be encoded in Unicode; the
        default encoding is UTF-8. You can override this by using any
        encoding supported by your Python's open() function if :code:`path`
        is a filepath. If passing in a file handle, it is assumed you have
        already opened it with the correct encoding specified.
    """
    from agate.table import Table

    if key is not None and newline:
        raise ValueError('key and newline may not be specified together.')

    close = False

    try:
        if newline:
            js = []

            if hasattr(path, 'read'):
                for line in path:
                    js.append(json.loads(line, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs))
            else:
                f = io.open(path, encoding=encoding)
                close = True

                for line in f:
                    js.append(json.loads(line, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs))
        else:
            if hasattr(path, 'read'):
                js = json.load(path, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)
            else:
                f = io.open(path, encoding=encoding)
                close = True

                js = json.load(f, object_pairs_hook=OrderedDict, parse_float=Decimal, **kwargs)

        if isinstance(js, dict):
            if not key:
                raise TypeError('When converting a JSON document with a top-level dictionary element, a key must be specified.')

            js = js[key]
    finally:
        if close:
            f.close()

    return Table.from_object(js, row_names=row_names, column_types=column_types)
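# Sketch of the two file layouts from_json() distinguishes. Both file names
# are hypothetical.
import agate

t1 = agate.Table.from_json('rows.ndjson', newline=True)  # one JSON object per line
t2 = agate.Table.from_json('doc.json', key='results')    # rows under a top-level key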
def from_fixed(cls, path, schema_path, column_names=utils.default, column_types=None, row_names=None, encoding='utf-8', schema_encoding='utf-8'):
    """
    Create a new table from a fixed-width file and a CSV schema.

    Schemas must be in the "ffs" format. There is a repository of such
    schemas maintained at `wireservice/ffs <https://github.com/wireservice/ffs>`_.

    :param path:
        File path or file-like object from which to read fixed-width data.
    :param schema_path:
        File path or file-like object from which to read schema (CSV) data.
    :param column_names:
        By default, these will be parsed from the schema. For alternatives,
        see :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param encoding:
        Character encoding of the fixed-width file. Note: if passing in a
        file handle it is assumed you have already opened it with the
        correct encoding specified.
    :param schema_encoding:
        Character encoding of the schema file. Note: if passing in a file
        handle it is assumed you have already opened it with the correct
        encoding specified.
    """
    from agate.table import Table

    close_f = False

    if not hasattr(path, 'read'):
        f = io.open(path, encoding=encoding)
        close_f = True
    else:
        f = path

    close_schema_f = False

    if not hasattr(schema_path, 'read'):
        schema_f = io.open(schema_path, encoding=schema_encoding)
        close_schema_f = True
    else:
        schema_f = schema_path

    reader = fixed.reader(f, schema_f)
    rows = list(reader)

    if close_f:
        f.close()

    if close_schema_f:
        schema_f.close()

    if column_names == utils.default:
        column_names = reader.fieldnames

    return Table(rows, column_names, column_types, row_names=row_names)
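# Sketch: pairing a fixed-width data file with an ffs-format schema CSV.
# Both file paths are hypothetical.
import agate

table = agate.Table.from_fixed('data.txt', 'schema.csv')
# column names come from the schema unless column_names is given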
def from_csv(cls, path, column_names=None, column_types=None, row_names=None, skip_lines=0, header=True, sniff_limit=0, encoding='utf-8', row_limit=None, **kwargs):
    """
    Create a new table from a CSV.

    This method uses agate's builtin CSV reader, which supplies encoding
    support for both Python 2 and Python 3.

    :code:`kwargs` will be passed through to the CSV reader.

    :param path:
        Filepath or file-like object from which to read CSV data. If a
        file-like object is specified, it must be seekable. If using Python
        2, the file should be opened in binary mode (`rb`).
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    :param row_names:
        See :meth:`.Table.__init__`.
    :param skip_lines:
        The number of lines to skip from the top of the file.
    :param header:
        If :code:`True`, the first row of the CSV is assumed to contain
        column names. If :code:`header` and :code:`column_names` are both
        specified then a row will be skipped, but :code:`column_names` will
        be used.
    :param sniff_limit:
        Limit CSV dialect sniffing to the specified number of bytes. Set to
        None to sniff the entire file. Defaults to 0 (no sniffing).
    :param encoding:
        Character encoding of the CSV file. Note: if passing in a file
        handle it is assumed you have already opened it with the correct
        encoding specified.
    :param row_limit:
        Limit how many rows of data will be read.
    """
    from agate import csv
    from agate.table import Table

    close = False

    try:
        if hasattr(path, 'read'):
            f = path
        else:
            if six.PY2:
                # 'U' enables universal-newline mode on Python 2.
                f = open(path, 'Urb')
            else:
                f = io.open(path, encoding=encoding)

            close = True

        if isinstance(skip_lines, int):
            while skip_lines > 0:
                f.readline()
                skip_lines -= 1
        else:
            raise ValueError('skip_lines argument must be an int')

        contents = six.StringIO(f.read())

        if sniff_limit is None:
            kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue())
        elif sniff_limit > 0:
            kwargs['dialect'] = csv.Sniffer().sniff(contents.getvalue()[:sniff_limit])

        if six.PY2:
            kwargs['encoding'] = encoding

        reader = csv.reader(contents, header=header, **kwargs)

        if header:
            if column_names is None:
                column_names = next(reader)
            else:
                next(reader)

        if row_limit is None:
            rows = tuple(reader)
        else:
            rows = tuple(itertools.islice(reader, row_limit))
    finally:
        if close:
            f.close()

    return Table(rows, column_names, column_types, row_names=row_names)
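# Sketch of common from_csv() options from the docstring above. The file
# name is hypothetical.
import agate

table = agate.Table.from_csv('data.csv', sniff_limit=None, row_limit=100)
# sniff_limit=None sniffs the whole file for its dialect; row_limit caps rows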