def load_year_killed_data(year):
    """Fetch the accidents-killed CSV for ``year`` as an agate table.

    ``year`` is substituted into the S3 URL. The ``killed`` and ``injured``
    columns are read as numbers and ``date_hour`` as text.
    """
    url = 'https://s3.amazonaws.com/traffic-sd/accidents_killed_{}.csv'.format(year)
    column_overrides = dict(
        killed=agate.Number(),
        injured=agate.Number(),
        date_hour=agate.Text(),
    )
    return agate.Table.from_url(url, column_types=column_overrides)
def _analyze_date(self, event):
    """Print one calendar heatmap per year of row counts grouped by date.

    Rows of ``self.table`` without a ``fields/date`` value are ignored. The
    remaining rows get three derived columns (date, year, time) and are then
    counted per date within each distinct year.

    :param event: Object whose ``set()`` is called once the per-year tables
        are built and before anything is printed (presumably signals a
        waiting progress indicator; confirm with the caller).
    """
    def date_formula(data_type, reducer):
        # Every derived column applies one helper to the raw date field.
        return agate.Formula(data_type, lambda row: reducer(row["fields/date"]))

    dated_rows = self.table.where(lambda row: row["fields/date"] is not None)
    table = dated_rows.compute([
        ("reduce_to_date", date_formula(agate.Text(), helpers.reduce_to_date)),
        ("reduce_to_year", date_formula(agate.Number(), helpers.reduce_to_year)),
        ("reduce_to_time", date_formula(agate.Number(), helpers.reduce_to_time)),
    ])

    years = table.distinct("reduce_to_year").columns["reduce_to_year"].values()

    # One pivot (date -> Count) per year, ordered by date.
    per_year = {}
    for year in years:
        rows_in_year = table.where(lambda row: row["reduce_to_year"] == year)
        per_year[year] = (
            rows_in_year
            .select("reduce_to_date")
            .pivot("reduce_to_date")
            .order_by("reduce_to_date")
        )

    event.set()

    print(f"\n\n{helpers.h1_icn} Date\n")

    for year in years:
        daily = per_year[year]
        labels = list(daily.columns["reduce_to_date"].values())
        counts = [int(value) for value in daily.columns["Count"].values()]
        total = sum(counts)
        heatmap_options = {
            "color": False,
            "custom_tick": False,
            "start_dt": f"{year}-01-01",
        }
        print(f"\n{helpers.h2_icn} Year {year} ({total:,} emails)\n")
        # The heatmap takes one single-element list per label.
        calendar_heatmap(
            data=[[count] for count in counts],
            args=heatmap_options,
            labels=labels,
        )
def calc_table(in_csv, out_csv):
    """Pivot the survey CSV and write the result with two derived columns.

    Reads ``in_csv`` using the module-level ``column_names`` and
    ``column_types``, pivots on ``HIEDUC`` by ``STAYTOG``, appends ``Total``
    and then ``Percent agree``, writes the table to ``out_csv`` and returns it.
    """
    source = agate.Table.from_csv(
        in_csv,
        column_names=column_names,
        column_types=column_types,
    )
    result = source.pivot('HIEDUC', 'STAYTOG')

    # Applied one at a time, in order, so the second formula can see the
    # column added by the first.
    derived_columns = (
        ('Total', get_total),
        ('Percent agree', get_percent_agree),
    )
    for label, row_function in derived_columns:
        result = result.compute([(label, agate.Formula(agate.Number(), row_function))])

    result.to_csv(out_csv)
    return result
def setUp(self):
    """Build a four-row, five-column fixture table and an in-memory SQLite URL."""
    schema = (
        ('number', agate.Number()),
        ('text', agate.Text()),
        ('boolean', agate.Boolean()),
        ('date', agate.Date()),
        ('datetime', agate.DateTime()),
    )
    self.column_names = [name for name, _ in schema]
    self.column_types = [data_type for _, data_type in schema]

    # Includes a decimal, a non-ASCII text value, a duplicated number and an
    # all-null row.
    self.rows = (
        (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )

    self.table = agate.Table(self.rows, self.column_names, self.column_types)
    self.connection_string = 'sqlite:///:memory:'
def test_create_if_not_exists(self):
    """Writing two tables with ``create_if_not_exists`` appends to one SQL table.

    Both agate tables are written to ``create_if_not_exists_test``; the table
    read back must keep the schema and contain the rows of both sources.
    """
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]

    rows1 = (
        (1, 'Jake'),
        (2, 'Howard'),
    )
    rows2 = (
        (3, 'Liz'),
        (4, 'Tim'),
    )

    table1 = agate.Table(rows1, column_names, column_types)
    table2 = agate.Table(rows2, column_names, column_types)

    engine = create_engine(self.connection_string)
    connection = engine.connect()

    try:
        # Write two agate tables into the same SQL table
        table1.to_sql(connection, 'create_if_not_exists_test',
                      create=True, create_if_not_exists=True, insert=True)
        table2.to_sql(connection, 'create_if_not_exists_test',
                      create=True, create_if_not_exists=True, insert=True)

        table = agate.Table.from_sql(connection, 'create_if_not_exists_test')

        self.assertSequenceEqual(table.column_names, column_names)
        self.assertIsInstance(table.column_types[0], agate.Number)
        self.assertIsInstance(table.column_types[1], agate.Text)
        # The combined table holds the rows of both sources, so count
        # table1 and table2 rather than table1 twice.
        self.assertEqual(len(table.rows), len(table1.rows) + len(table2.rows))
        self.assertSequenceEqual(table.rows[0], table1.rows[0])
    finally:
        # Release database resources even when an assertion fails.
        connection.close()
        engine.dispose()
def setUp(self):
    """Build a three-row, five-column fixture table with matching name lists."""
    schema = (
        ('number', agate.Number()),
        ('text', agate.Text()),
        ('boolean', agate.Boolean()),
        ('date', agate.Date()),
        ('datetime', agate.DateTime()),
    )
    self.column_names = [name for name, _ in schema]
    # Same names, but a separate list object from ``column_names``.
    self.user_provided_column_names = list(self.column_names)
    self.column_types = [data_type for _, data_type in schema]

    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )

    self.table = agate.Table(self.rows, self.column_names, self.column_types)
def setUp(self):
    """Load ``examples/heights.csv`` into ``self.table`` and clear old output.

    The first CSV record is treated as a header and discarded. Any file left
    at ``TEST_FILENAME`` is removed.
    """
    text_type = agate.Text()
    number_type = agate.Number()

    columns = (
        ('gender', text_type),
        ('month', number_type),
        ('median', number_type),
        ('stdev', number_type),
        ('1st', number_type),
        ('3rd', number_type),
        ('5th', number_type),
        ('15th', number_type),
        ('25th', number_type),
        ('50th', number_type),
        ('75th', number_type),
        ('85th', number_type),
        ('95th', number_type),
        ('97th', number_type),
        ('99th', number_type),
    )

    with open('examples/heights.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip the header through the reader (not the raw file) so the CSV
        # parser consumes exactly one record, even if it spans several lines.
        next(reader)

        # Create the table
        self.table = agate.Table(reader, columns)

    if os.path.exists(TEST_FILENAME):
        os.remove(TEST_FILENAME)
def compute_ranks(table):
    """Append rank, standardized-rank and overall-rank columns to ``table``.

    Four ``compute`` passes run in sequence, each relying on columns from the
    previous one:

    1. a rank column per metric,
    2. a ``*_std`` column per rank via ``StandadizeScore``,
    3. ``overall_rank_data`` from the ``overall_rank`` row function,
    4. ``overall_rank``, the rank of ``overall_rank_data``.

    Returns the table produced by the final pass.
    """
    # Rankings currently disabled: open_formats_rank, open_datasets_rank,
    # update_start_rank, start_rank and the openess_score formula.
    rank_columns = [
        ('dataset_rank', agate.Rank('datasets', reverse=True)),
        ('formats_rank', agate.Rank('format_count', reverse=True)),
        ('last_update_rank', agate.Rank('days_since_last_update')),
        ('category_rank', agate.Rank('category_count', reverse=True)),
        ('category_variance_rank', agate.Rank('category_variance')),
        ('dataset_score_rank', agate.Rank('dataset_score', reverse=True)),
        ('category_score_rank', agate.Rank('category_score', reverse=True)),
    ]
    ranked = table.compute(rank_columns)

    # One standardized column per rank column, in the same order.
    standardized_columns = [
        (rank_name + '_std', StandadizeScore(rank_name))
        for rank_name, _ in rank_columns
    ]
    standardized = ranked.compute(standardized_columns)

    scored = standardized.compute([
        ('overall_rank_data', agate.Formula(agate.Number(), overall_rank)),
    ])

    return scored.compute([
        ('overall_rank', agate.Rank('overall_rank_data')),
    ])
def _overall_stats(self):
    """Compute portal-wide package and resource statistics.

    Sets ``overall_package_stats``, ``resource_stats``, ``open_datasets``,
    ``open_format_count`` and ``format_count`` on the instance, then calls
    ``self.compute_dates()``.
    """
    # Counts the values of the ``license_id`` column that appear in
    # utils.OPEN_LICENSES (summing booleans yields the number of matches).
    count_open_licenses = agate.Summary(
        'license_id', agate.Number(),
        lambda r: sum(license_id in utils.OPEN_LICENSES
                      for license_id in r.values()))

    self.overall_package_stats = self._package_table().aggregate([
        ('open_data_count', count_open_licenses),
    ])

    # Flag every resource row using the ``open_formats_count`` row function.
    self.resource_stats = self._package_resource_table().compute([
        ('open_format', agate.Formula(agate.Boolean(), open_formats_count)),
    ])

    # Only aggregate when there is at least one resource row.
    # NOTE(review): when the table is empty ``resource_stats`` stays the
    # computed table, and ``.get`` is called on it below. Confirm that this
    # path is supported.
    if len(self._package_resource_table()) > 0:
        self.resource_stats = self.resource_stats.aggregate([
            ('open_format_count', agate.Count('open_format', True)),
            ('min_date', agate.Min('created')),
            ('max_date', agate.Max('created'))
        ])

    # One row per distinct ``format`` value; counting those rows gives the
    # number of different formats.
    format_table = self._package_resource_table().group_by(
        "format").aggregate([
            ('count', agate.Count()),
        ])
    count = format_table.aggregate([
        ('different_formats', agate.Count()),
    ])

    # Each lookup falls back to 0 when the key is missing.
    self.open_datasets = self.overall_package_stats.get(
        "open_data_count", 0)
    self.open_format_count = self.resource_stats.get(
        "open_format_count", 0)
    self.format_count = count.get("different_formats", 0)

    self.compute_dates()
def get_package_stats(self, package_table):
    """Aggregate ``package_table`` into an ``open_data_count`` statistic.

    ``open_data_count`` is the number of ``license_id`` values that appear in
    ``utils.OPEN_LICENSES``.
    """
    def _count_open(license_column):
        return sum(
            1
            for license_id in license_column.values()
            if license_id in utils.OPEN_LICENSES
        )

    open_license_summary = agate.Summary('license_id', agate.Number(), _count_open)
    return package_table.aggregate([('open_data_count', open_license_summary)])
def test_chunk_size(self):
    """A 9999-row table written with ``chunk_size=100`` round-trips intact.

    Checks both the row count and the sum of the single ``number`` column
    after reading the table back.
    """
    column_names = ['number']
    column_types = [agate.Number()]

    row_count = 9999
    rows = [(value,) for value in range(row_count)]
    expected = sum(range(row_count))

    engine = create_engine(self.connection_string)
    connection = engine.connect()

    try:
        source = agate.Table(rows, column_names, column_types)
        source.to_sql(connection, 'test_chunk_size', overwrite=True, chunk_size=100)

        loaded = agate.Table.from_sql(connection, 'test_chunk_size')
        actual = sum(row[0] for row in loaded.rows)

        self.assertEqual(len(loaded.rows), len(rows))
        self.assertEqual(expected, actual)
    finally:
        connection.close()
        engine.dispose()
def load_data(data):
    """
    Load the UNHCR persons-of-concern export into ``data['table']``.

    The header record is skipped and every ``'*'`` cell is replaced with
    ``None`` before the table is built.
    """
    text_type = agate.Text()
    number_type = agate.Number()

    column_order = (
        'year', 'residence', 'origin', 'refugees', 'asylum_seekers',
        'returned_refugees', 'idps', 'returned_idps', 'stateless_persons',
        'others', 'total',
    )
    text_columns = ('residence', 'origin')

    # Every column is numeric except the two place-name columns.
    columns = OrderedDict(
        (name, text_type if name in text_columns else number_type)
        for name in column_order
    )

    # Load the data
    with open('unhcr_popstats_export_persons_of_concern_2016_01_12_192533.csv') as f:
        reader = csvkit.reader(f)
        next(reader)

        rows = [
            [None if cell == '*' else cell for cell in record]
            for record in reader
        ]

    data['table'] = agate.Table(rows, columns.keys(), columns.values())
def load_data(data):
    """Load the exonerations CSV into ``data['exonerations']``.

    The first CSV record is treated as a header and discarded. The remaining
    records are typed according to the column list below.
    """
    text_type = agate.Text()
    number_type = agate.Number()
    boolean_type = agate.Boolean()

    columns = (
        ('last_name', text_type),
        ('first_name', text_type),
        ('age', number_type),
        ('race', text_type),
        ('state', text_type),
        ('tags', text_type),
        ('crime', text_type),
        ('sentence', text_type),
        ('convicted', number_type),
        ('exonerated', number_type),
        ('dna', boolean_type),
        ('dna_essential', text_type),
        ('mistaken_witness', boolean_type),
        ('false_confession', boolean_type),
        ('perjury', boolean_type),
        ('false_evidence', boolean_type),
        ('official_misconduct', boolean_type),
        ('inadequate_defense', boolean_type),
    )

    with open('examples/realdata/exonerations-20150828.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip the header through the reader (not the raw file) so the CSV
        # parser consumes exactly one record, even if it spans several lines.
        next(reader)

        # Create the table
        data['exonerations'] = agate.Table(reader, columns)
def get_column_types(self):
    """Build the ``agate.TypeTester`` matching the command-line arguments.

    With ``blanks`` set, every type is created with ``null_values=()``. With
    ``no_inference`` set, only text is tested. Otherwise number is tested
    right after boolean, unless a date or datetime format was given, in which
    case it is tested just before text.
    """
    null_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
    text_type = agate.Text(**null_kwargs)

    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    number_type = agate.Number(locale=self.args.locale, **null_kwargs)
    boolean_type = agate.Boolean(**null_kwargs)
    timedelta_type = agate.TimeDelta(**null_kwargs)
    date_type = agate.Date(date_format=self.args.date_format, **null_kwargs)
    datetime_type = agate.DateTime(datetime_format=self.args.datetime_format, **null_kwargs)

    # See the order in the `agate.TypeTester` class.
    if self.args.date_format or self.args.datetime_format:
        # In order to parse dates like "20010101", try the date types before
        # falling back to number.
        ordered = [boolean_type, timedelta_type, date_type, datetime_type,
                   number_type, text_type]
    else:
        ordered = [boolean_type, number_type, timedelta_type, date_type,
                   datetime_type, text_type]

    return agate.TypeTester(types=ordered)
def test_create_if_not_exists(self):
    """Writing twice with ``create_if_not_exists`` must not raise.

    Two agate tables with the same schema are written to the same SQL table.
    The test passes when the second write succeeds even though the table
    already exists.
    """
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]

    rows1 = (
        (1, 'Jake'),
        (2, 'Howard'),
    )
    rows2 = (
        (3, 'Liz'),
        (4, 'Tim'),
    )

    table1 = agate.Table(rows1, column_names, column_types)
    table2 = agate.Table(rows2, column_names, column_types)

    engine = create_engine(self.connection_string)
    connection = engine.connect()

    try:
        # Write two agate tables into the same SQL table
        table1.to_sql(connection, 'create_if_not_exists_test',
                      create=True, create_if_not_exists=True, insert=True)
        table2.to_sql(connection, 'create_if_not_exists_test',
                      create=True, create_if_not_exists=True, insert=True)
    finally:
        # Release database resources even when a write fails.
        connection.close()
        engine.dispose()
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    :raises ValueError:
        If a column's sqlalchemy type maps to an unsupported Python type.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    # Everything after acquiring the connection runs inside try/finally so a
    # failure during reflection, type mapping or the query cannot leak it.
    try:
        metadata = MetaData(connection)
        sql_table = Table(table_name, metadata, autoload=True, autoload_with=connection)

        column_names = []
        column_types = []

        for sql_column in sql_table.columns:
            column_names.append(sql_column.name)

            # Interval types are mapped to timedelta explicitly instead of
            # asking the column type for its python_type.
            if type(sql_column.type) in INTERVAL_MAP.values():
                py_type = datetime.timedelta
            else:
                py_type = sql_column.type.python_type

            if py_type in [int, float, decimal.Decimal]:
                if py_type is float:
                    # Have sqlalchemy return Decimal values for float columns.
                    sql_column.type.asdecimal = True
                column_types.append(agate.Number())
            elif py_type is bool:
                column_types.append(agate.Boolean())
            elif issubclass(py_type, six.string_types):
                column_types.append(agate.Text())
            elif py_type is datetime.date:
                column_types.append(agate.Date())
            elif py_type is datetime.datetime:
                column_types.append(agate.DateTime())
            elif py_type is datetime.timedelta:
                column_types.append(agate.TimeDelta())
            else:
                raise ValueError('Unsupported sqlalchemy column type: %s' % type(sql_column.type))

        s = select([sql_table])
        rows = connection.execute(s)

        return agate.Table(rows, column_names, column_types)
    finally:
        # Only clean up when an engine was returned. Presumably that means
        # the connection was opened here from a connection string; confirm
        # against get_engine_and_connection.
        if engine is not None:
            connection.close()
            engine.dispose()
def _add_random_column(data_tbl):
    """Return ``data_tbl`` with a numeric ``random_group`` column appended.

    The column values come from the ``_generate_random`` row function.
    """
    # Reseed first so each call draws fresh random numbers.
    npr.seed()
    random_group = agate.Formula(agate.Number(), _generate_random)
    return data_tbl.compute([('random_group', random_group)])
def test_load(self):
    """Load the exonerations CSV with three forced column types and print it."""
    forced_types = {
        'last_name': agate.Text(),
        'first_name': agate.Text(),
        'age': agate.Number(),
    }
    tester = agate.TypeTester(force=forced_types)

    exonerations = agate.Table.from_csv(
        '../../../data/exonerations-20150828.csv',
        column_types=tester,
    )

    print(exonerations)  # description of the table
def test_make_sql_table_min_col_len(self):
    """``min_col_len`` sets a floor on the generated text column length.

    The longest ``name`` value is 10 characters; with ``min_col_len=20`` the
    SQL column length must be 20.
    """
    rows = ((1, 'x' * 10), (2, ''))
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    table = agate.Table(rows, column_names, column_types)

    sql_table = agatesql.table.make_sql_table(
        table,
        'test_table',
        dialect='mysql',
        db_schema='test_schema',
        constraints=True,
        min_col_len=20,
    )

    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(sql_table.columns.get('name').type.length, 20)
def test_to_sql_create_statement_wide_width(self):
    """A very wide text value yields a ``TEXT`` column in the MySQL DDL.

    The ``name`` column holds a 21845-character string (presumably beyond
    what fits in a MySQL VARCHAR; confirm), and the expected statement
    declares it as ``TEXT``.
    """
    rows = ((1, 'x' * 21845), (2, ''))
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    table = agate.Table(rows, column_names, column_types)

    statement = table.to_sql_create_statement('test_table', db_schema='test_schema', dialect='mysql')

    # Tabs in the generated DDL are normalised before the exact comparison.
    # NOTE(review): the whitespace inside the expected literal (line breaks,
    # indentation, trailing spaces) must match the generated statement
    # exactly; verify it against the real output.
    self.assertEqual(statement.replace('\t', ' '), '''CREATE TABLE test_schema.test_table ( id DECIMAL(38, 0) NOT NULL, name TEXT );''')  # noqa
def get_types(example_row):
    """Map each xlrd cell in ``example_row`` to a fresh agate data type.

    ``number`` cells become ``agate.Number``, ``xldate`` cells become
    ``agate.Date``, and everything else (including ``text``) becomes
    ``agate.Text``.
    """
    type_for_ctype = {
        'text': agate.Text,
        'number': agate.Number,
        'xldate': agate.Date,
    }
    return [
        type_for_ctype.get(xlrd.sheet.ctype_text[cell.ctype], agate.Text)()
        for cell in example_row
    ]
def test_distinct_values(self):
    """``distinct_values('size')`` matches agate's own distinct on the data.

    The reference result is built by selecting ``size`` from an in-memory
    table and de-duplicating it.
    NOTE(review): the inline rows are presumably a copy of
    ``sample-data/test-distinct.csv``; confirm they stay in sync.
    """
    column_names: List = [
        'id',
        'name',
        'dob',
        'last seen',
        'size',
        'active',
    ]
    column_types: List = [
        agate.Number(),
        agate.Text(),
        agate.Date(),
        agate.DateTime(),
        agate.Text(),
        agate.Boolean(),
    ]
    rows = [
        (1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L', True),
        (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S', False),
        (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00', 'M', True),
        (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00', 'S', True),
        (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L', True),
        (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00', 'M', False),
        (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00', 'M', False),
        (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00', 'XL', True),
        (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00', 'L', False),
        (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M', True),
    ]

    model = csvhound.core.BaseHound()
    # Called before distinct_values, as in the original test; the returned
    # table itself is not needed here.
    model.get_table_from_file('sample-data/test-distinct.csv')
    distinct = model.distinct_values('size')

    agate_table = agate.Table(rows, column_names, column_types)
    distinct_agate = agate_table.select('size').distinct('size')

    # now do the testing
    self.assertColumnNames(distinct, ('size', ))
    # Compare against the reference table's types. Checking ``distinct``
    # against its own types can never fail.
    self.assertColumnTypes(distinct, [type(c) for c in distinct_agate.column_types])
    self.assertRows(distinct, distinct_agate)
def main():
    """Print a bar chart of row counts per date for the first and last 10 dates.

    Reads ``SRC_PATH`` with every column as a string, counts rows per ``date``
    value and sorts by date. A ``'...'`` placeholder row with no count
    separates the head from the tail.
    """
    frame = pd.read_csv(SRC_PATH, dtype=str)
    per_date = frame['date'].value_counts().sort_index()

    # just get first 10 rows and last 10 rows
    gap = pd.Series({'...': None}, name='date')
    excerpt = pd.concat([per_date.head(10), gap, per_date.tail(10)])

    # agate wants a list of [date, count] rows.
    records = list(map(list, excerpt.to_dict().items()))

    chart_table = agate.Table(records, ['date', 'count'], [agate.Text(), agate.Number()])
    chart_table.print_bars('date', 'count')
def test_to_sql_create_statement_zero_width(self):
    """A text column holding only empty strings is emitted as ``VARCHAR(1)``."""
    column_names = ['id', 'name']
    column_types = [agate.Number(), agate.Text()]
    rows = ((1, ''), (2, ''))
    table = agate.Table(rows, column_names, column_types)

    ddl = table.to_sql_create_statement(
        'test_table',
        db_schema='test_schema',
        dialect='mysql',
    )

    expected_fragments = (
        'CREATE TABLE test_schema.test_table',
        'id DECIMAL(38, 0) NOT NULL,',
        'name VARCHAR(1)',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, ddl)
def get_column_types(self):
    """Build the ``agate.TypeTester`` matching the command-line arguments.

    With ``blanks`` set, the text type is created with ``cast_nulls=False``.
    With ``no_inference`` set, only text is tested. Otherwise the order is
    boolean, number, timedelta, date, datetime, text.
    """
    keep_blanks = getattr(self.args, 'blanks', None)
    text_type = agate.Text(cast_nulls=False) if keep_blanks else agate.Text()

    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    candidates = [
        agate.Boolean(),
        agate.Number(locale=self.args.locale),
        agate.TimeDelta(),
        agate.Date(date_format=self.args.date_format),
        agate.DateTime(datetime_format=self.args.datetime_format),
        text_type,
    ]
    return agate.TypeTester(types=candidates)
def get_column_types(self):
    """Build the ``agate.TypeTester`` matching the command-line arguments.

    With ``blanks`` set, every type is created with ``null_values=()``. With
    ``no_inference`` set, only text is tested. Otherwise the order is
    boolean, number, timedelta, date, datetime, text.
    """
    null_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
    text_type = agate.Text(**null_kwargs)

    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    candidates = [
        agate.Boolean(**null_kwargs),
        agate.Number(locale=self.args.locale, **null_kwargs),
        agate.TimeDelta(**null_kwargs),
        agate.Date(date_format=self.args.date_format, **null_kwargs),
        agate.DateTime(datetime_format=self.args.datetime_format, **null_kwargs),
        text_type,
    ]
    return agate.TypeTester(types=candidates)
def sum_counts_by_hour(data):
    """Store per-hour accident statistics in ``data['hour']`` and return ``data``.

    ``data['table']`` is grouped by ``hour`` and aggregated. Percent columns
    are then added, followed by a ``weighted`` column (killed percent plus
    injured percent) and three half-standard-deviation columns.
    """
    def _weighted(row):
        return row['killed_percent'] + row['injured_percent']

    hourly_totals = data['table'].group_by('hour').aggregate([
        ('killed', agate.Sum('killed')),
        ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('accidents_injured', count_accidents_injured),
    ])

    with_percents = hourly_totals.compute([
        ('killed_percent', agate.Percent('killed')),
        ('injured_percent', agate.Percent('injured')),
        ('accidents_percent', agate.Percent('accidents')),
    ])

    # Second pass: ``weighted`` needs the percent columns from the first.
    data['hour'] = with_percents.compute([
        ('weighted', agate.Formula(agate.Number(), _weighted)),
        ('accidents_within_half_deviation', StandardDeviations('accidents', 0.5)),
        ('killed_within_half_deviation', StandardDeviations('killed', 0.5)),
        ('injured_within_half_deviation', StandardDeviations('injured', 0.5)),
    ])

    return data
def update_where(self, update_col, update_val, test_col, test_val):
    """Return a table whose ``update_col`` is recomputed by ``update_where_function``.

    Copies of ``update_col`` and ``test_col`` and two constant columns holding
    ``update_val`` and ``test_val`` are joined on as scratch columns. They let
    the row function see old values, test values and replacement together.
    The scratch columns and the old ``update_col`` are dropped afterwards.
    """
    # check the types of the update and test columns.
    names = self.column_names
    scratch_types = [
        self.column_types[names.index(update_col)],
        self.column_types[names.index(test_col)],
    ]

    # Copies of the two columns under fixed names.
    column_copies = self.select([update_col, test_col]).rename(
        column_names={update_col: 'update_col', test_col: 'test_col'})
    with_copies = self.join(column_copies)

    # One constant (update_val, test_val) row per table row.
    constants = agate.Table(
        [[update_val, test_val]] * len(with_copies.rows),
        ['update_val', 'test_val'],
        scratch_types,
    )
    with_constants = with_copies.join(constants)

    recomputed = with_constants.compute([
        ('updated', agate.Formula(agate.Number(), update_where_function)),
    ])

    # Swap the new column into place and drop the scratch columns.
    swapped = recomputed.rename(
        column_names={update_col: 'old', 'updated': update_col})
    return swapped.exclude(
        ['old', 'update_col', 'test_col', 'update_val', 'test_val'])
def get_column_types(self):
    """Build the ``agate.TypeTester`` matching the command-line arguments.

    With ``blanks`` set, every type is created with ``null_values=()``. With
    ``no_inference`` set, only text is tested. Otherwise the order is
    boolean, timedelta, date, datetime, number, text.
    """
    null_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}
    text_type = agate.Text(**null_kwargs)

    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    # This is a different order than agate's default, in order to parse
    # dates like "20010101": number is tested after the date types.
    candidates = [
        agate.Boolean(**null_kwargs),
        agate.TimeDelta(**null_kwargs),
        agate.Date(date_format=self.args.date_format, **null_kwargs),
        agate.DateTime(datetime_format=self.args.datetime_format, **null_kwargs),
        agate.Number(locale=self.args.locale, **null_kwargs),
        text_type,
    ]
    return agate.TypeTester(types=candidates)
def setUp(self):
    """Build a three-row, six-column fixture table that includes a timedelta."""
    schema = (
        ('number', agate.Number()),
        ('text', agate.Text()),
        ('boolean', agate.Boolean()),
        ('date', agate.Date()),
        ('datetime', agate.DateTime()),
        ('timedelta', agate.TimeDelta()),
    )
    self.column_names = [name for name, _ in schema]
    self.column_types = [data_type for _, data_type in schema]

    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
        (None, 'b', None, None, None, None),
    )

    self.table = agate.Table(self.rows, self.column_names, self.column_types)