def test_get_catalog_various_schemas(self, mock_get_schemas, mock_execute):
    column_names = ['table_database', 'table_schema', 'table_name']
    rows = [
        ('dbt', 'foo', 'bar'),
        ('dbt', 'FOO', 'baz'),
        ('dbt', None, 'bar'),
        ('dbt', 'quux', 'bar'),
        ('dbt', 'skip', 'bar'),
    ]
    mock_execute.return_value = agate.Table(rows=rows, column_names=column_names)
    mock_get_schemas.return_value.items.return_value = [
        (mock.MagicMock(database='dbt'), {'foo', 'FOO', 'quux'})
    ]
    mock_manifest = mock.MagicMock()
    mock_manifest.get_used_schemas.return_value = {('dbt', 'foo'), ('dbt', 'quux')}

    catalog, exceptions = self.adapter.get_catalog(mock_manifest)
    self.assertEqual(
        set(map(tuple, catalog)),
        {('dbt', 'foo', 'bar'), ('dbt', 'FOO', 'baz'), ('dbt', 'quux', 'bar')}
    )
    self.assertEqual(exceptions, [])
def load_data(data):
    text_type = agate.Text()
    number_type = agate.Number()
    boolean_type = agate.Boolean()

    columns = (
        ('last_name', text_type),
        ('first_name', text_type),
        ('age', number_type),
        ('race', text_type),
        ('state', text_type),
        ('tags', text_type),
        ('crime', text_type),
        ('sentence', text_type),
        ('convicted', number_type),
        ('exonerated', number_type),
        ('dna', boolean_type),
        ('dna_essential', text_type),
        ('mistaken_witness', boolean_type),
        ('false_confession', boolean_type),
        ('perjury', boolean_type),
        ('false_evidence', boolean_type),
        ('official_misconduct', boolean_type),
        ('inadequate_defense', boolean_type),
    )

    with open('examples/realdata/exonerations-20150828.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip header
        next(f)

        # Create the table
        data['exonerations'] = agate.Table(reader, columns)
def setUp(self):
    text_type = agate.Text()
    number_type = agate.Number()

    columns = (
        ('gender', text_type),
        ('month', number_type),
        ('median', number_type),
        ('stdev', number_type),
        ('1st', number_type),
        ('3rd', number_type),
        ('5th', number_type),
        ('15th', number_type),
        ('25th', number_type),
        ('50th', number_type),
        ('75th', number_type),
        ('85th', number_type),
        ('95th', number_type),
        ('97th', number_type),
        ('99th', number_type),
    )

    with open('examples/heights.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip header
        next(f)

        # Create the table
        self.table = agate.Table(reader, columns)

    if os.path.exists(TEST_FILENAME):
        os.remove(TEST_FILENAME)
def sql_query(self, query, table_name='agate'):
    """
    Convert this agate table into an intermediate, in-memory sqlite table,
    run a query against it, and then return the results as a new agate table.

    Multiple queries may be separated with semicolons.

    :param query:
        One SQL query, or multiple queries to be run consecutively separated
        with semicolons.
    :param table_name:
        The name to use for the table in the queries, defaults to ``agate``.
    """
    _, connection = get_engine_and_connection()

    # Execute the specified SQL queries
    queries = query.split(';')
    rows = None

    self.to_sql(connection, table_name)

    for q in queries:
        if q:
            rows = connection.execute(q)

    table = agate.Table(list(rows), column_names=rows._metadata.keys)

    return table
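# A minimal usage sketch for sql_query above, assuming the agate-sql monkey
# patch is applied (importing agatesql attaches sql_query/to_sql to
# agate.Table). The table contents and names here are hypothetical.
import agate
import agatesql  # noqa: F401 -- patches agate.Table

people = agate.Table(
    [(1, 'Jake'), (2, 'Howard')],
    ['id', 'name'],
    [agate.Number(), agate.Text()],
)

# The agate table is round-tripped through in-memory sqlite as "people"
result = people.sql_query('SELECT name FROM people WHERE id > 1', table_name='people')
result.print_table()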
def write_weighted_means_csv():
    column_names = ['county_type']
    column_types = [text_type]

    for age in ages:
        for income in incomes:
            column_names.append('weighted_mean_{0}yo_{1}k'.format(age, income))
            column_types.append(number_type)

    county_types = [
        (rural_weighted, 'rural'),
        (small_towns_weighted, 'small_towns'),
        (metro_weighted, 'metro'),
    ]
    rows = []

    for county_type in county_types:
        row = [county_type[1]]
        total_population = county_type[0].aggregate(agate.Sum('Population'))

        for age in ages:
            for income in incomes:
                score = county_type[0].aggregate(
                    agate.Sum('weighted_score_{0}yo_{1}k'.format(age, income)))
                row.append(score / total_population)

        rows.append(row)

    # to_csv writes to disk and returns None, so there is nothing to assign
    agate.Table(rows, column_names, column_types).to_csv('data/output/weighted_means.csv')
def get_columns_in_relation(self, relation):
    _, results = self.connections.execute(
        f"pragma {relation.schema}.table_info({relation.identifier})",
        fetch=True)

    new_rows = []
    for row in results:
        # pragma table_info returns (cid, name, type, notnull, dflt_value, pk)
        new_row = [
            row[1],            # column_name
            row[2] or 'TEXT',  # data_type; SQLite may leave it empty
            None,              # character_maximum_length
            None,              # numeric_precision
            None,              # numeric_scale
        ]
        new_rows.append(new_row)

    column_names = [
        'column_name',
        'data_type',
        'character_maximum_length',
        'numeric_precision',
        'numeric_scale',
    ]

    table = agate.Table(new_rows, column_names)

    kwargs = {
        'table': table
    }

    result = self.execute_macro(
        'sql_convert_columns_in_relation',
        kwargs=kwargs
    )

    return result
def test_chunk_size(self):
    column_names = ['number']
    column_types = [agate.Number()]
    rows = []
    expected = 0

    for n in range(9999):
        rows.append((n,))
        expected += n

    engine = create_engine(self.connection_string)
    connection = engine.connect()

    try:
        table = agate.Table(rows, column_names, column_types)
        table.to_sql(connection, 'test_chunk_size', overwrite=True, chunk_size=100)

        table = agate.Table.from_sql(connection, 'test_chunk_size')
        actual = sum(r[0] for r in table.rows)

        self.assertEqual(len(table.rows), len(rows))
        self.assertEqual(expected, actual)
    finally:
        connection.close()
        engine.dispose()
def setUp(self):
    self.rows = (
        (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )

    self.column_names = [
        'number', 'text', 'boolean', 'date', 'datetime',
    ]

    self.column_types = [
        agate.Number(), agate.Text(), agate.Boolean(),
        agate.Date(), agate.DateTime(),
    ]

    self.table = agate.Table(self.rows, self.column_names, self.column_types)
    self.connection_string = 'sqlite:///:memory:'
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    metadata = MetaData(connection)
    sql_table = Table(table_name, metadata, autoload=True, autoload_with=connection)

    column_names = []
    column_types = []

    for sql_column in sql_table.columns:
        column_names.append(sql_column.name)

        if type(sql_column.type) in INTERVAL_MAP.values():
            py_type = datetime.timedelta
        else:
            py_type = sql_column.type.python_type

        if py_type in [int, float, decimal.Decimal]:
            if py_type is float:
                sql_column.type.asdecimal = True
            column_types.append(agate.Number())
        elif py_type is bool:
            column_types.append(agate.Boolean())
        elif issubclass(py_type, six.string_types):
            column_types.append(agate.Text())
        elif py_type is datetime.date:
            column_types.append(agate.Date())
        elif py_type is datetime.datetime:
            column_types.append(agate.DateTime())
        elif py_type is datetime.timedelta:
            column_types.append(agate.TimeDelta())
        else:
            raise ValueError('Unsupported sqlalchemy column type: %s' % type(sql_column.type))

    s = select([sql_table])
    rows = connection.execute(s)

    try:
        return agate.Table(rows, column_names, column_types)
    finally:
        if engine is not None:
            connection.close()
            engine.dispose()
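# Hedged usage sketch for the from_sql patch: given a connection string, the
# table is loaded back with agate types inferred from the schema. The
# database path and table name are hypothetical.
import agate
import agatesql  # noqa: F401 -- patches agate.Table with from_sql

users = agate.Table.from_sql('sqlite:///example.db', 'users')
print(users.column_names, users.column_types)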
def _make_table_of(self, rows, column_types):
    column_names = list(string.ascii_letters[:len(rows[0])])
    if isinstance(column_types, type):
        column_types = [self._get_tester_for(column_types) for _ in column_names]
    else:
        column_types = [self._get_tester_for(typ) for typ in column_types]
    table = agate.Table(rows, column_names=column_names, column_types=column_types)
    return table
def get_table(new_arr, types, titles):
    """
    Return an agate table when given an array of data, a list of types and a
    list of titles.
    """
    try:
        table = agate.Table(new_arr, titles, types)
        return table
    except Exception as e:
        # On failure, print the error and implicitly return None
        print(e)
def _get_one_catalog(
    self,
    information_schema: InformationSchema,
    schemas: Set[str],
    manifest: Manifest,
) -> agate.Table:
    """
    bad form to override this method but...
    """
    # this does N+1 queries but there doesn't seem to be
    # any other way to do this
    rows = []

    for schema in schemas:
        schema_obj = self.Relation.create(database=information_schema.database, schema=schema)
        results = self.list_relations_without_caching(schema_obj)

        if len(results) > 0:
            for relation_row in results:
                name = relation_row.name
                relation_type = str(relation_row.type)

                table_info = self.connections.execute(
                    f"pragma {schema}.table_info({name})", fetch=True)

                for table_row in table_info[1]:
                    rows.append([
                        information_schema.database,
                        schema,
                        name,
                        relation_type,
                        '',
                        '',
                        table_row['name'],
                        table_row['cid'],
                        table_row['type'] or 'TEXT',
                        ''
                    ])

    column_names = [
        'table_database',
        'table_schema',
        'table_name',
        'table_type',
        'table_comment',
        'table_owner',
        'column_name',
        'column_index',
        'column_type',
        'column_comment'
    ]
    table = agate.Table(rows, column_names)
    results = self._catalog_filter_table(table, manifest)
    return results
def attr_test_output(self, obj_type, attr_name, results):
    # Load the data
    table = agate.Table(results, ['group', 'model', attr_name])

    # Pull out the rows where the attribute check failed
    fails = table.where(lambda row: row[attr_name] is False)

    if fails.rows:
        print("Fail: %s %ss are missing %s" % (len(fails.rows), obj_type, attr_name))
        fails.select(["group", "model"]).print_table(max_column_width=50)
def table_from_rows(
    rows: List[Any],
    column_names: Iterable[str],
    text_only_columns: Optional[Iterable[str]] = None,
) -> agate.Table:
    if text_only_columns is None:
        column_types = DEFAULT_TYPE_TESTER
    else:
        column_types = build_type_tester(text_only_columns)

    return agate.Table(rows, column_names, column_types=column_types)
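# Sketch of calling table_from_rows above with a text-only column, so values
# like ZIP codes keep leading zeroes instead of being coerced to numbers.
# The data and column names are hypothetical.
zips = table_from_rows(
    [('02134', 3), ('10001', 5)],
    ['zip', 'count'],
    text_only_columns={'zip'},
)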
def from_xlsx(cls, path, sheet=None):
    """
    Parse an XLSX file.

    :param path:
        Path to an XLSX file to load or a file or file-like object for one.
    :param sheet:
        The name or integer index of a worksheet to load. If not specified
        then the "active" sheet will be used.
    """
    if hasattr(path, 'read'):
        f = path
    else:
        f = open(path, 'rb')

    book = openpyxl.load_workbook(f, read_only=True, data_only=True)

    if isinstance(sheet, six.string_types):
        sheet = book[sheet]
    elif isinstance(sheet, int):
        sheet = book.worksheets[sheet]
    else:
        sheet = book.active

    column_names = []
    rows = []

    for i, row in enumerate(sheet.rows):
        if i == 0:
            column_names = [c.value for c in row]
            continue

        values = []

        for c in row:
            value = c.value

            if value.__class__ is datetime.datetime:
                # Handle default XLSX date as 00:00 time
                if value.date() == datetime.date(1904, 1, 1) and not has_date_elements(c):
                    value = value.time()

                    value = normalize_datetime(value)
                elif value.time() == NULL_TIME:
                    value = value.date()
                else:
                    value = normalize_datetime(value)

            values.append(value)

        rows.append(values)

    f.close()

    return agate.Table(rows, column_names)
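# Hedged usage sketch, assuming the agate-excel patch (importing agateexcel
# attaches from_xlsx to agate.Table). The filename and sheet name are
# hypothetical.
import agate
import agateexcel  # noqa: F401 -- patches agate.Table with from_xlsx

budget = agate.Table.from_xlsx('budget.xlsx', sheet='2015')
budget.print_table(max_rows=5)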
def test_make_sql_table_min_col_len(self):
    rows = ((1, 'x' * 10), (2, ''))
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    table = agate.Table(rows, column_names, column_types)

    sql_table = agatesql.table.make_sql_table(
        table, 'test_table', dialect='mysql', db_schema='test_schema',
        constraints=True, min_col_len=20)

    self.assertEqual(sql_table.columns.get('name').type.length, 20)
def test_join(self):
    left_rows = [(six.text_type(i), i) for i in range(100000)]
    right_rows = [(six.text_type(i), i) for i in range(100000)]

    shuffle(left_rows)
    shuffle(right_rows)

    column_names = ['text', 'number']
    column_types = [agate.Text(), agate.Number()]

    left = agate.Table(left_rows, column_names, column_types)
    right = agate.Table(right_rows, column_names, column_types)

    def test():
        left.join(right, 'text')

    results = Timer(test).repeat(10, 1)

    min_time = min(results)

    self.assertLess(min_time, 10)  # CI unreliable, 5s witnessed
def from_dbf(cls, path, encoding=None):
    """
    Parse a DBF file.

    :param path:
        Path to a DBF file to load. Note that due to limitations of the
        dependency you can not pass a file handle; it must be a path.
    """
    dbf = DBF(path, load=True, encoding=encoding, recfactory=recfactory)
    table = agate.Table(dbf.records, column_names=dbf.field_names)

    return table
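# Usage sketch, assuming agate-dbf is installed (importing agatedbf attaches
# from_dbf to agate.Table). The path is hypothetical; per the docstring it
# must be a real filesystem path, not a file handle.
import agate
import agatedbf  # noqa: F401 -- patches agate.Table with from_dbf

parcels = agate.Table.from_dbf('examples/parcels.dbf')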
def test_to_sql_create_statement_wide_width(self):
    rows = ((1, 'x' * 21845), (2, ''))
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    table = agate.Table(rows, column_names, column_types)

    statement = table.to_sql_create_statement('test_table', db_schema='test_schema', dialect='mysql')

    self.assertEqual(statement.replace('\t', '  '), '''CREATE TABLE test_schema.test_table (
  id DECIMAL(38, 0) NOT NULL,
  name TEXT
);''')  # noqa
def test_choices(self):
    """
    Verify that valid choices are available for all expected fields on all
    models.
    """
    # substrings that appear in choice fields
    choice_field_strs = [
        '_cd',
        '_code',
        '_type',
        'status',
        '_lvl',
        'reportname',
        'form_id',
    ]
    exceptions = [
        'LookupCodesCd.code_type',
        'S497Cd.sup_off_cd',
        'FilerStatusTypesCd.status_type',
        'FilerStatusTypesCd.status_desc',
        'FilerTypesCd.filer_type',
    ]
    results = []

    model_list = sorted(get_model_list(), key=lambda x: (x().klass_group, x().klass_name))

    for m in model_list:
        for f in m._meta.fields:
            if (any(x in f.name for x in choice_field_strs) and
                    f.name != 'memo_code' and
                    f.__class__ is not ForeignKeyField and
                    '{}.{}'.format(m().klass_name, f.name) not in exceptions):

                if not f.choices:
                    results.append((m().klass_group, m.__name__, f.name, "Has no CHOICES defined"))

                if not f.documentcloud_pages:
                    results.append(
                        (m().klass_group, m.__name__, f.name, "Has no `documentcloud_pages` defined"))

                # Pull out all the choices in that field
                for slug, name in f.choices:
                    # Make sure that each has a definition
                    if not name or name == '':
                        results.append(
                            (m().klass_group, m.__name__, f.name, "Value '%s' undefined in CHOICES" % slug))

    table = agate.Table(results, ['group', 'model', 'field', 'message'])
    table.print_table(max_rows=None, max_column_width=50)
def test_lookup_require_match(self):
    rows = (('WA',), ('VA',), ('FA',))
    column_names = ['usps']
    column_types = [agate.Text()]

    table = agate.Table(rows, column_names, column_types)

    with self.assertRaises(ValueError):
        table.lookup('usps', 'state', require_match=True, source=self._source)
def table_from_data(data, column_names):
    "Convert a list of dictionaries into an Agate table"

    # The agate table is generated from a list of dicts, so the column order
    # from `data` is not preserved. We can use `select` to reorder the columns.
    #
    # If there is no data, create an empty table with the specified columns.
    if len(data) == 0:
        return agate.Table([], column_names=column_names)
    else:
        table = agate.Table.from_object(data, column_types=DEFAULT_TYPE_TESTER)
        return table.select(column_names)
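# Illustration of why table_from_data re-selects the columns: from_object
# does not preserve the key order of the input dicts, so the result is
# re-projected onto the caller's order. The data here is hypothetical and
# assumes the module context above (agate, DEFAULT_TYPE_TESTER).
data = [{'b': 2, 'a': 1}, {'b': 4, 'a': 3}]
ordered = table_from_data(data, column_names=['a', 'b'])
print(ordered.column_names)  # ('a', 'b'), regardless of dict key order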
def dbf2csv(f, **kwargs):
    """
    Convert a dBASE .dbf file to csv.
    """
    with dbf.Table(f.name) as db:
        column_names = db.field_names
        table = agate.Table(db, column_names)

    output = six.StringIO()
    table.to_csv(output)
    result = output.getvalue()
    output.close()

    return result
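# Hypothetical driver for dbf2csv: the function only reads f.name, so it
# expects an open handle onto a real .dbf file on disk that the dbf library
# can reopen by path.
with open('examples/parcels.dbf', 'rb') as f:
    print(dbf2csv(f))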
def test_distinct_values(self):
    column_names: List = [
        'id',
        'name',
        'dob',
        'last seen',
        'size',
        'active',
    ]
    column_types: List = [
        agate.Number(),
        agate.Text(),
        agate.Date(),
        agate.DateTime(),
        agate.Text(),
        agate.Boolean(),
    ]
    rows = [
        (1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L', True),
        (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S', False),
        (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00', 'M', True),
        (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00', 'S', True),
        (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L', True),
        (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00', 'M', False),
        (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00', 'M', False),
        (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00', 'XL', True),
        (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00', 'L', False),
        (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M', True),
    ]

    model = csvhound.core.BaseHound()
    table = model.get_table_from_file('sample-data/test-distinct.csv')
    distinct = model.distinct_values('size')

    agate_table = agate.Table(rows, column_names, column_types)
    distinct_agate = agate_table.select('size').distinct('size')

    # now do the testing
    self.assertColumnNames(distinct, ('size',))
    self.assertColumnTypes(distinct, [type(c) for c in distinct.column_types])
    self.assertRows(distinct, distinct_agate)
def test_join(self):
    left_rows = [(six.text_type(i), i) for i in range(100000)]
    right_rows = [(six.text_type(i), i) for i in range(100000)]

    shuffle(left_rows)
    shuffle(right_rows)

    number_type = agate.Number()
    text_type = agate.Text()

    columns = (
        ('text', text_type),
        ('number', number_type),
    )

    left = agate.Table(left_rows, columns)
    right = agate.Table(right_rows, columns)

    def test():
        left.join(right, 'text')

    results = Timer(test).repeat(10, 1)

    min_time = min(results)

    self.assertLess(min_time, 0)
def test_lookup_no_match(self):
    rows = (('WA',), ('VA',), ('FA',))
    column_names = ['usps']
    column_types = [agate.Text()]

    table = agate.Table(rows, column_names, column_types)

    result = table.lookup('usps', 'state', source=self._source)

    self.assertColumnNames(result, ['usps', 'state'])
    self.assertColumnTypes(result, [agate.Text, agate.Text])

    self.assertSequenceEqual(result.rows[2].values(), ['FA', None])
def table_from_data_flat(data, column_names):
    "Convert a list of dictionaries into an Agate table"
    rows = []
    for _row in data:
        row = []
        for value in list(_row.values()):
            if isinstance(value, (dict, list, tuple)):
                # Serialize nested containers to JSON strings so every cell
                # is a scalar agate can type-infer
                row.append(json.dumps(value))
            else:
                row.append(value)
        rows.append(row)

    return agate.Table(rows, column_names)
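# Sketch showing the flattening behavior of table_from_data_flat: nested
# containers become JSON strings while scalars pass through unchanged. The
# data is hypothetical and assumes the module context above (agate, json).
data = [{'name': 'model_a', 'tags': ['nightly', 'prod'], 'rows': 10}]
flat = table_from_data_flat(data, ['name', 'tags', 'rows'])
print(flat.rows[0]['tags'])  # '["nightly", "prod"]'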
def main():
    df = pd.read_csv(SRC_PATH, dtype=str)
    counts = df['date'].value_counts().sort_index()

    # just get first 10 rows and last 10 rows
    counts = pd.concat([
        counts.head(10),
        pd.Series({'...': None}, name='date'),
        counts.tail(10)
    ])

    # is there really no way to convert a Pandas series to list-of-lists?
    vals = [[k, v] for k, v in counts.to_dict().items()]

    table = agate.Table(vals, ['date', 'count'], [agate.Text(), agate.Number()])
    table.print_bars('date', 'count')
def test_create_if_not_exists(self):
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]

    rows1 = (
        (1, 'Jake'),
        (2, 'Howard'),
    )

    rows2 = (
        (3, 'Liz'),
        (4, 'Tim'),
    )

    table1 = agate.Table(rows1, column_names, column_types)
    table2 = agate.Table(rows2, column_names, column_types)

    engine = create_engine(self.connection_string)
    connection = engine.connect()

    # Write two agate tables into the same SQL table
    table1.to_sql(connection, 'create_if_not_exists_test', create=True, create_if_not_exists=True, insert=True)
    table2.to_sql(connection, 'create_if_not_exists_test', create=True, create_if_not_exists=True, insert=True)

    table = agate.Table.from_sql(connection, 'create_if_not_exists_test')

    self.assertSequenceEqual(table.column_names, column_names)
    self.assertIsInstance(table.column_types[0], agate.Number)
    self.assertIsInstance(table.column_types[1], agate.Text)
    self.assertEqual(len(table.rows), len(table1.rows) + len(table2.rows))
    self.assertSequenceEqual(table.rows[0], table1.rows[0])