def get_column_types(self): if getattr(self.args, 'blanks', None): type_kwargs = {'null_values': ()} else: type_kwargs = {} text_type = agate.Text(**type_kwargs) if self.args.no_inference: types = [text_type] else: number_type = agate.Number(locale=self.args.locale, **type_kwargs) # See the order in the `agate.TypeTester` class. types = [ agate.Boolean(**type_kwargs), agate.TimeDelta(**type_kwargs), agate.Date(date_format=self.args.date_format, **type_kwargs), agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs), text_type, ] # In order to parse dates like "20010101". if self.args.date_format or self.args.datetime_format: types.insert(-1, number_type) else: types.insert(1, number_type) return agate.TypeTester(types=types)
def from_sql(cls, connection_or_string, table_name): """ Create a new :class:`agate.Table` from a given SQL table. Types will be inferred from the database schema. Monkey patched as class method :meth:`Table.from_sql`. :param connection_or_string: An existing sqlalchemy connection or connection string. :param table_name: The name of a table in the referenced database. """ engine, connection = get_engine_and_connection(connection_or_string) metadata = MetaData(connection) sql_table = Table(table_name, metadata, autoload=True, autoload_with=connection) column_names = [] column_types = [] for sql_column in sql_table.columns: column_names.append(sql_column.name) if type(sql_column.type) in INTERVAL_MAP.values(): py_type = datetime.timedelta else: py_type = sql_column.type.python_type if py_type in [int, float, decimal.Decimal]: if py_type is float: sql_column.type.asdecimal = True column_types.append(agate.Number()) elif py_type is bool: column_types.append(agate.Boolean()) elif issubclass(py_type, six.string_types): column_types.append(agate.Text()) elif py_type is datetime.date: column_types.append(agate.Date()) elif py_type is datetime.datetime: column_types.append(agate.DateTime()) elif py_type is datetime.timedelta: column_types.append(agate.TimeDelta()) else: raise ValueError('Unsupported sqlalchemy column type: %s' % type(sql_column.type)) s = select([sql_table]) rows = connection.execute(s) try: return agate.Table(rows, column_names, column_types) finally: if engine is not None: connection.close() engine.dispose()
def _get_tester_for(self, column_type): from dbt.clients import agate_helper if column_type is agate.TimeDelta: # dbt never makes this! return agate.TimeDelta() for instance in agate_helper.DEFAULT_TYPE_TESTER._possible_types: if type(instance) is column_type: return instance raise ValueError(f'no tester for {column_type}')
def get_column_types(self): if getattr(self.args, 'blanks', None): text_type = agate.Text(cast_nulls=False) else: text_type = agate.Text() if self.args.no_inference: return agate.TypeTester(types=[text_type]) else: return agate.TypeTester(types=[ agate.Boolean(), agate.Number(locale=self.args.locale), agate.TimeDelta(), agate.Date(date_format=self.args.date_format), agate.DateTime(datetime_format=self.args.datetime_format), text_type ])
def get_column_types(self): if getattr(self.args, 'blanks', None): type_kwargs = {'null_values': ()} else: type_kwargs = {} types = [agate.Text(**type_kwargs)] if not self.args.no_inference: types = [ agate.Boolean(**type_kwargs), agate.Number(locale=self.args.locale, **type_kwargs), agate.TimeDelta(**type_kwargs), agate.Date(date_format=self.args.date_format, **type_kwargs), agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs), ] + types return agate.TypeTester(types=types)
def get_column_types(self): if getattr(self.args, 'blanks', None): type_kwargs = {'null_values': ()} else: type_kwargs = {} types = [agate.Text(**type_kwargs)] if not self.args.no_inference: types = [ agate.Boolean(**type_kwargs), agate.TimeDelta(**type_kwargs), agate.Date(date_format=self.args.date_format, **type_kwargs), agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs), # This is a different order than agate's default, in order to parse dates like "20010101". agate.Number(locale=self.args.locale, **type_kwargs), ] + types return agate.TypeTester(types=types)
def setUp(self): self.rows = ((1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'), (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'), (None, 'b', None, None, None, None)) self.column_names = [ 'number', 'text', 'boolean', 'date', 'datetime', 'timedelta' ] self.column_types = [ agate.Number(), agate.Text(), agate.Boolean(), agate.Date(), agate.DateTime(), agate.TimeDelta() ] self.table = agate.Table(self.rows, self.column_names, self.column_types)
def setUp(self): self.rows = (('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM', '4h45m'), ('2', 'b', 'False', '11/01/2015', '11/01/2015 12:45 PM', '3h25m'), ('', '', '', '', '', '')) self.number_type = agate.Number() self.text_type = agate.Text() self.boolean_type = agate.Boolean() self.date_type = agate.Date() self.datetime_type = agate.DateTime() self.timedelta_type = agate.TimeDelta() self.column_names = ('number', 'text', 'boolean', 'date', 'datetime', 'timedelta') self.column_types = (self.number_type, self.text_type, self.boolean_type, self.date_type, self.datetime_type, self.timedelta_type) self.table = agate.Table(self.rows, zip(self.column_names, self.column_types))
def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, **kwargs): """ Parse an XLS file. :param path: Path to an XLS file to load or a file-like object for one. :param sheet: The names or integer indices of the worksheets to load. If not specified then the first sheet will be used. :param skip_lines: The number of rows to skip from the top of the sheet. :param header: If :code:`True`, the first row is assumed to contain column names. """ if not isinstance(skip_lines, int): raise ValueError('skip_lines argument must be an int') if hasattr(path, 'read'): book = xlrd.open_workbook(file_contents=path.read(), encoding_override=encoding_override) else: with open(path, 'rb') as f: book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override) multiple = agate.utils.issequence(sheet) if multiple: sheets = sheet else: sheets = [sheet] tables = OrderedDict() for i, sheet in enumerate(sheets): if isinstance(sheet, six.string_types): sheet = book.sheet_by_name(sheet) elif isinstance(sheet, int): sheet = book.sheet_by_index(sheet) else: sheet = book.sheet_by_index(0) if header: offset = 1 column_names = [] else: offset = 0 column_names = None columns = [] column_types = [] for i in range(sheet.ncols): data = sheet.col_values(i) values = data[skip_lines + offset:] types = sheet.col_types(i)[skip_lines + offset:] excel_type = determine_excel_type(types) agate_type = determine_agate_type(excel_type) if excel_type == xlrd.biffh.XL_CELL_BOOLEAN: values = normalize_booleans(values) elif excel_type == xlrd.biffh.XL_CELL_DATE: values, with_date, with_time = normalize_dates( values, book.datemode) if not with_date: agate_type = agate.TimeDelta() if not with_time: agate_type = agate.Date() if header: name = six.text_type(data[skip_lines]) or None column_names.append(name) columns.append(values) column_types.append(agate_type) rows = [] if columns: for i in range(len(columns[0])): rows.append([c[i] for c in columns]) if 'column_names' in kwargs: if not header: column_names = kwargs['column_names'] del kwargs['column_names'] if 'column_types' in kwargs: column_types = kwargs['column_types'] del kwargs['column_types'] tables[sheet.name] = agate.Table(rows, column_names, column_types, **kwargs) if multiple: return agate.MappedSequence(tables.values(), tables.keys()) else: return tables.popitem()[1]
def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, row_limit=None, column_names=None, column_types=None, **kwargs): """ Parse an XLS file. :param path: Path to an XLS file to load or a file-like object for one. :param sheet: The names or integer indices of the worksheets to load. If not specified then the first sheet will be used. :param skip_lines: The number of rows to skip from the top of the sheet. :param header: If :code:`True`, the first row is assumed to contain column names. :param row_limit: Limit how many rows of data will be read :param column_names: See :meth:`.Table.__init__`. :param column_types: See :meth:`.Table.__init__`. """ if not isinstance(skip_lines, int): raise ValueError('skip_lines argument must be an int') def open_workbook(f): try: book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override, on_demand=True) except xlrd.compdoc.CompDocError: # This is not a pure XLS file; we'll try to read it though. # Let's try the Compound File Binary Format: ole = olefile.OleFileIO(f) if ole.exists('Workbook'): d = ole.openstream('Workbook') book = xlrd.open_workbook(file_contents=d.read(), on_demand=True) else: raise IOError('No Workbook stream found in OLE file') return book if hasattr(path, 'read'): book = open_workbook(path) else: with open(path, 'rb') as f: book = open_workbook(f) try: multiple = agate.utils.issequence(sheet) if multiple: sheets = sheet else: sheets = [sheet] tables = OrderedDict() for i, sheet in enumerate(sheets): if isinstance(sheet, six.string_types): sheet = book.sheet_by_name(sheet) elif isinstance(sheet, int): sheet = book.sheet_by_index(sheet) else: sheet = book.sheet_by_index(0) if header: offset = 1 column_names_detected = [] else: offset = 0 column_names_detected = None columns = [] column_types_detected = [] for i in range(sheet.ncols): if row_limit is None: values = sheet.col_values(i, skip_lines + offset) types = sheet.col_types(i, skip_lines + offset) else: values = sheet.col_values(i, skip_lines + offset, skip_lines + offset + row_limit) types = sheet.col_types(i, skip_lines + offset, skip_lines + offset + row_limit) excel_type = determine_excel_type(types) agate_type = determine_agate_type(excel_type) if excel_type == xlrd.biffh.XL_CELL_BOOLEAN: values = normalize_booleans(values) elif excel_type == xlrd.biffh.XL_CELL_DATE: values, with_date, with_time = normalize_dates( values, book.datemode) if not with_date: agate_type = agate.TimeDelta() if not with_time: agate_type = agate.Date() if header: name = six.text_type(sheet.cell_value(skip_lines, i)) or None column_names_detected.append(name) columns.append(values) column_types_detected.append(agate_type) rows = [] if columns: for i in range(len(columns[0])): rows.append([c[i] for c in columns]) if column_names is None: sheet_column_names = column_names_detected else: sheet_column_names = column_names sheet_column_types = column_types if isinstance(column_types, dict) and sheet_column_names is not None: sheet_column_types = dict( zip(sheet_column_names, column_types_detected)) sheet_column_types.update(column_types) tables[sheet.name] = agate.Table(rows, sheet_column_names, sheet_column_types, **kwargs) finally: book.release_resources() if multiple: return agate.MappedSequence(tables.values(), tables.keys()) else: return tables.popitem()[1]