Example #1
0
    def from_csv(cls, path, column_info=None, row_names=None, header=True, **kwargs):
        """
        Create a new table for a CSV. This method will use csvkit if it is
        available, otherwise it will use Python's builtin csv module.

        ``kwargs`` will be passed through to :meth:`csv.reader`.

        If you are using Python 2 and not using csvkit, this method is not
        unicode-safe.

        :param path:
            Filepath or file-like object from which to read CSV data.
        :param column_info:
            May be any valid input to :meth:`Table.__init__` or
            an instance of :class:`.TypeTester`. Or, None, in which case a
            generic :class:`.TypeTester` will be created.
        :param row_names:
            See :meth:`Table.__init__`.
        :param header:
            If `True`, the first row of the CSV is assumed to contains
            headers and will be skipped.
        """
        if column_info is None:
            column_info = TypeTester()

        use_inference = isinstance(column_info, TypeTester)

        if hasattr(path, 'read'):
            rows = list(csv.reader(path, **kwargs))
        else:
            with open(path) as f:
                rows = list(csv.reader(f, **kwargs))

        if header:
            column_names = rows.pop(0)
        else:
            column_names = [None] * len(rows[0])

        if use_inference:
            column_info = column_info.run(rows, column_names)
        else:
            if len(column_names) != len(column_info):
                # TKTK Better Error
                raise ValueError('CSV contains more columns than were specified.')

        return Table(rows, column_info, row_names=row_names)
Example #2
0
    def __init__(self,
                 rows,
                 column_names=None,
                 column_types=None,
                 row_names=None,
                 _is_fork=False):
        # Validate column names
        if column_names:
            final_column_names = []

            for i, column_name in enumerate(column_names):
                if column_name is None:
                    final_column_names.append(utils.letter_name(i))
                elif isinstance(column_name, six.string_types):
                    final_column_names.append(column_name)
                else:
                    raise ValueError('Column names must be strings or None.')

            if len(set(final_column_names)) != len(final_column_names):
                raise ValueError('Duplicate column names are not allowed.')

            self._column_names = tuple(final_column_names)
        else:
            self._column_names = tuple(
                utils.letter_name(i) for i in range(len(rows[0])))

        len_column_names = len(self._column_names)

        # Validate column_types
        if column_types is None:
            column_types = TypeTester()
        elif isinstance(column_types, TypeTester):
            pass
        else:
            for column_type in column_types:
                if not isinstance(column_type, DataType):
                    raise ValueError(
                        'Column types must be instances of DataType.')

        if isinstance(column_types, TypeTester):
            self._column_types = column_types.run(rows, self._column_names)
        else:
            self._column_types = tuple(column_types)

        if len_column_names != len(self._column_types):
            raise ValueError(
                'column_names and column_types must be the same length.')

        if not _is_fork:
            new_rows = []
            cast_funcs = [c.cast for c in self._column_types]

            for i, row in enumerate(rows):
                len_row = len(row)

                if len_row > len_column_names:
                    raise ValueError(
                        'Row %i has %i values, but Table only has %i columns.'
                        % (i, len_row, len_column_names))
                elif len(row) < len_column_names:
                    row = chain(row,
                                [None] * (len(self.column_names) - len_row))

                new_rows.append(
                    Row(tuple(cast_funcs[i](d) for i, d in enumerate(row)),
                        self._column_names))
        else:
            new_rows = rows

        if row_names:
            computed_row_names = []

            if isinstance(row_names, six.string_types):
                for row in new_rows:
                    name = row[row_names]
                    computed_row_names.append(name)
            elif hasattr(row_names, '__call__'):
                for row in new_rows:
                    name = row_names(row)
                    computed_row_names.append(name)
            elif isinstance(row_names, Sequence):
                computed_row_names = row_names
            else:
                raise ValueError(
                    'row_names must be a column name, function or sequence')

            self._row_names = tuple(computed_row_names)
        else:
            self._row_names = None

        self._rows = MappedSequence(new_rows, self._row_names)

        # Build columns
        new_columns = []

        for i, (name, data_type) in enumerate(
                zip(self._column_names, self._column_types)):
            column = Column(i,
                            name,
                            data_type,
                            self._rows,
                            row_names=self._row_names)
            new_columns.append(column)

        self._columns = MappedSequence(new_columns, self._column_names)