コード例 #1
0
ファイル: cli.py プロジェクト: vishalbelsare/csvkit
    def get_column_types(self):
        if getattr(self.args, 'blanks', None):
            type_kwargs = {'null_values': ()}
        else:
            type_kwargs = {}

        text_type = agate.Text(**type_kwargs)

        if self.args.no_inference:
            types = [text_type]
        else:
            number_type = agate.Number(locale=self.args.locale, **type_kwargs)

            # See the order in the `agate.TypeTester` class.
            types = [
                agate.Boolean(**type_kwargs),
                agate.TimeDelta(**type_kwargs),
                agate.Date(date_format=self.args.date_format, **type_kwargs),
                agate.DateTime(datetime_format=self.args.datetime_format,
                               **type_kwargs),
                text_type,
            ]

            # In order to parse dates like "20010101".
            if self.args.date_format or self.args.datetime_format:
                types.insert(-1, number_type)
            else:
                types.insert(1, number_type)

        return agate.TypeTester(types=types)
コード例 #2
0
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    metadata = MetaData(connection)
    sql_table = Table(table_name,
                      metadata,
                      autoload=True,
                      autoload_with=connection)

    column_names = []
    column_types = []

    for sql_column in sql_table.columns:
        column_names.append(sql_column.name)

        if type(sql_column.type) in INTERVAL_MAP.values():
            py_type = datetime.timedelta
        else:
            py_type = sql_column.type.python_type

        if py_type in [int, float, decimal.Decimal]:
            if py_type is float:
                sql_column.type.asdecimal = True
            column_types.append(agate.Number())
        elif py_type is bool:
            column_types.append(agate.Boolean())
        elif issubclass(py_type, six.string_types):
            column_types.append(agate.Text())
        elif py_type is datetime.date:
            column_types.append(agate.Date())
        elif py_type is datetime.datetime:
            column_types.append(agate.DateTime())
        elif py_type is datetime.timedelta:
            column_types.append(agate.TimeDelta())
        else:
            raise ValueError('Unsupported sqlalchemy column type: %s' %
                             type(sql_column.type))

    s = select([sql_table])

    rows = connection.execute(s)

    try:
        return agate.Table(rows, column_names, column_types)
    finally:
        if engine is not None:
            connection.close()
            engine.dispose()
コード例 #3
0
    def _get_tester_for(self, column_type):
        from dbt.clients import agate_helper
        if column_type is agate.TimeDelta:  # dbt never makes this!
            return agate.TimeDelta()

        for instance in agate_helper.DEFAULT_TYPE_TESTER._possible_types:
            if type(instance) is column_type:
                return instance

        raise ValueError(f'no tester for {column_type}')
コード例 #4
0
    def get_column_types(self):
        if getattr(self.args, 'blanks', None):
            text_type = agate.Text(cast_nulls=False)
        else:
            text_type = agate.Text()

        if self.args.no_inference:
            return agate.TypeTester(types=[text_type])
        else:
            return agate.TypeTester(types=[
                agate.Boolean(),
                agate.Number(locale=self.args.locale),
                agate.TimeDelta(),
                agate.Date(date_format=self.args.date_format),
                agate.DateTime(datetime_format=self.args.datetime_format),
                text_type
            ])
コード例 #5
0
ファイル: cli.py プロジェクト: leonqli/csvkit
    def get_column_types(self):
        if getattr(self.args, 'blanks', None):
            type_kwargs = {'null_values': ()}
        else:
            type_kwargs = {}

        types = [agate.Text(**type_kwargs)]

        if not self.args.no_inference:
            types = [
                agate.Boolean(**type_kwargs),
                agate.Number(locale=self.args.locale, **type_kwargs),
                agate.TimeDelta(**type_kwargs),
                agate.Date(date_format=self.args.date_format, **type_kwargs),
                agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
            ] + types

        return agate.TypeTester(types=types)
コード例 #6
0
ファイル: cli.py プロジェクト: v838/csvkit
    def get_column_types(self):
        if getattr(self.args, 'blanks', None):
            type_kwargs = {'null_values': ()}
        else:
            type_kwargs = {}

        types = [agate.Text(**type_kwargs)]

        if not self.args.no_inference:
            types = [
                agate.Boolean(**type_kwargs),
                agate.TimeDelta(**type_kwargs),
                agate.Date(date_format=self.args.date_format, **type_kwargs),
                agate.DateTime(datetime_format=self.args.datetime_format,
                               **type_kwargs),
                # This is a different order than agate's default, in order to parse dates like "20010101".
                agate.Number(locale=self.args.locale, **type_kwargs),
            ] + types

        return agate.TypeTester(types=types)
コード例 #7
0
    def setUp(self):
        self.rows = ((1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
                     (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM',
                      '6:18'), (None, 'b', None, None, None, None))

        self.column_names = [
            'number', 'text', 'boolean', 'date', 'datetime', 'timedelta'
        ]

        self.column_types = [
            agate.Number(),
            agate.Text(),
            agate.Boolean(),
            agate.Date(),
            agate.DateTime(),
            agate.TimeDelta()
        ]

        self.table = agate.Table(self.rows, self.column_names,
                                 self.column_types)
コード例 #8
0
    def setUp(self):
        self.rows = (('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM',
                      '4h45m'), ('2', 'b', 'False', '11/01/2015',
                                 '11/01/2015 12:45 PM', '3h25m'), ('', '', '',
                                                                   '', '', ''))

        self.number_type = agate.Number()
        self.text_type = agate.Text()
        self.boolean_type = agate.Boolean()
        self.date_type = agate.Date()
        self.datetime_type = agate.DateTime()
        self.timedelta_type = agate.TimeDelta()

        self.column_names = ('number', 'text', 'boolean', 'date', 'datetime',
                             'timedelta')
        self.column_types = (self.number_type, self.text_type,
                             self.boolean_type, self.date_type,
                             self.datetime_type, self.timedelta_type)

        self.table = agate.Table(self.rows,
                                 zip(self.column_names, self.column_types))
コード例 #9
0
def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        book = xlrd.open_workbook(file_contents=path.read(),
                                  encoding_override=encoding_override)
    else:
        with open(path, 'rb') as f:
            book = xlrd.open_workbook(file_contents=f.read(),
                                      encoding_override=encoding_override)

    multiple = agate.utils.issequence(sheet)
    if multiple:
        sheets = sheet
    else:
        sheets = [sheet]

    tables = OrderedDict()

    for i, sheet in enumerate(sheets):
        if isinstance(sheet, six.string_types):
            sheet = book.sheet_by_name(sheet)
        elif isinstance(sheet, int):
            sheet = book.sheet_by_index(sheet)
        else:
            sheet = book.sheet_by_index(0)

        if header:
            offset = 1
            column_names = []
        else:
            offset = 0
            column_names = None

        columns = []
        column_types = []

        for i in range(sheet.ncols):
            data = sheet.col_values(i)
            values = data[skip_lines + offset:]
            types = sheet.col_types(i)[skip_lines + offset:]
            excel_type = determine_excel_type(types)
            agate_type = determine_agate_type(excel_type)

            if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                values = normalize_booleans(values)
            elif excel_type == xlrd.biffh.XL_CELL_DATE:
                values, with_date, with_time = normalize_dates(
                    values, book.datemode)
                if not with_date:
                    agate_type = agate.TimeDelta()
                if not with_time:
                    agate_type = agate.Date()

            if header:
                name = six.text_type(data[skip_lines]) or None
                column_names.append(name)

            columns.append(values)
            column_types.append(agate_type)

        rows = []

        if columns:
            for i in range(len(columns[0])):
                rows.append([c[i] for c in columns])

        if 'column_names' in kwargs:
            if not header:
                column_names = kwargs['column_names']
            del kwargs['column_names']

        if 'column_types' in kwargs:
            column_types = kwargs['column_types']
            del kwargs['column_types']

        tables[sheet.name] = agate.Table(rows, column_names, column_types,
                                         **kwargs)

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
コード例 #10
0
ファイル: table_xls.py プロジェクト: lcorbasson/agate-excel
def from_xls(cls,
             path,
             sheet=None,
             skip_lines=0,
             header=True,
             encoding_override=None,
             row_limit=None,
             column_names=None,
             column_types=None,
             **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param row_limit:
        Limit how many rows of data will be read
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    def open_workbook(f):
        try:
            book = xlrd.open_workbook(file_contents=f.read(),
                                      encoding_override=encoding_override,
                                      on_demand=True)
        except xlrd.compdoc.CompDocError:
            # This is not a pure XLS file; we'll try to read it though.
            # Let's try the Compound File Binary Format:
            ole = olefile.OleFileIO(f)
            if ole.exists('Workbook'):
                d = ole.openstream('Workbook')
                book = xlrd.open_workbook(file_contents=d.read(),
                                          on_demand=True)
            else:
                raise IOError('No Workbook stream found in OLE file')
        return book

    if hasattr(path, 'read'):
        book = open_workbook(path)
    else:
        with open(path, 'rb') as f:
            book = open_workbook(f)

    try:
        multiple = agate.utils.issequence(sheet)
        if multiple:
            sheets = sheet
        else:
            sheets = [sheet]

        tables = OrderedDict()

        for i, sheet in enumerate(sheets):
            if isinstance(sheet, six.string_types):
                sheet = book.sheet_by_name(sheet)
            elif isinstance(sheet, int):
                sheet = book.sheet_by_index(sheet)
            else:
                sheet = book.sheet_by_index(0)

            if header:
                offset = 1
                column_names_detected = []
            else:
                offset = 0
                column_names_detected = None

            columns = []
            column_types_detected = []

            for i in range(sheet.ncols):
                if row_limit is None:
                    values = sheet.col_values(i, skip_lines + offset)
                    types = sheet.col_types(i, skip_lines + offset)
                else:
                    values = sheet.col_values(i, skip_lines + offset,
                                              skip_lines + offset + row_limit)
                    types = sheet.col_types(i, skip_lines + offset,
                                            skip_lines + offset + row_limit)
                excel_type = determine_excel_type(types)
                agate_type = determine_agate_type(excel_type)

                if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                    values = normalize_booleans(values)
                elif excel_type == xlrd.biffh.XL_CELL_DATE:
                    values, with_date, with_time = normalize_dates(
                        values, book.datemode)
                    if not with_date:
                        agate_type = agate.TimeDelta()
                    if not with_time:
                        agate_type = agate.Date()

                if header:
                    name = six.text_type(sheet.cell_value(skip_lines,
                                                          i)) or None
                    column_names_detected.append(name)

                columns.append(values)
                column_types_detected.append(agate_type)

            rows = []

            if columns:
                for i in range(len(columns[0])):
                    rows.append([c[i] for c in columns])

            if column_names is None:
                sheet_column_names = column_names_detected
            else:
                sheet_column_names = column_names

            sheet_column_types = column_types
            if isinstance(column_types,
                          dict) and sheet_column_names is not None:
                sheet_column_types = dict(
                    zip(sheet_column_names, column_types_detected))
                sheet_column_types.update(column_types)

            tables[sheet.name] = agate.Table(rows, sheet_column_names,
                                             sheet_column_types, **kwargs)

    finally:
        book.release_resources()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]