def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        type_kwargs = {'null_values': ()}
    else:
        type_kwargs = {}

    text_type = agate.Text(**type_kwargs)

    if self.args.no_inference:
        types = [text_type]
    else:
        number_type = agate.Number(locale=self.args.locale, **type_kwargs)

        # See the order in the `agate.TypeTester` class.
        types = [
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
            text_type,
        ]

        # In order to parse dates like "20010101".
        if self.args.date_format or self.args.datetime_format:
            types.insert(-1, number_type)
        else:
            types.insert(1, number_type)

    return agate.TypeTester(types=types)
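# Usage sketch (not part of the original source): a TypeTester like the one
# returned above can be handed to agate.Table.from_csv(), which then tries each
# candidate type in order for every column. The file name is a placeholder.
import agate

tester = agate.TypeTester(types=[
    agate.Boolean(),
    agate.Number(),
    agate.TimeDelta(),
    agate.Date(),
    agate.DateTime(),
    agate.Text(),
])
table = agate.Table.from_csv('data.csv', column_types=tester)  # 'data.csv' is a placeholder
print(table)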
def setUp(self):
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )
    self.column_names = [
        'number', 'text', 'boolean', 'date', 'datetime',
    ]
    self.user_provided_column_names = [
        'number', 'text', 'boolean', 'date', 'datetime',
    ]
    self.column_types = [
        agate.Number(), agate.Text(), agate.Boolean(), agate.Date(), agate.DateTime(),
    ]

    self.table = agate.Table(self.rows, self.column_names, self.column_types)
def setUp(self):
    self.rows = (
        (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )
    self.column_names = [
        'number', 'text', 'boolean', 'date', 'datetime',
    ]
    self.column_types = [
        agate.Number(), agate.Text(), agate.Boolean(), agate.Date(), agate.DateTime(),
    ]

    self.table = agate.Table(self.rows, self.column_names, self.column_types)

    self.connection_string = 'sqlite:///:memory:'
def load_data(data):
    text_type = agate.Text()
    number_type = agate.Number()
    boolean_type = agate.Boolean()

    columns = (
        ('last_name', text_type),
        ('first_name', text_type),
        ('age', number_type),
        ('race', text_type),
        ('state', text_type),
        ('tags', text_type),
        ('crime', text_type),
        ('sentence', text_type),
        ('convicted', number_type),
        ('exonerated', number_type),
        ('dna', boolean_type),
        ('dna_essential', text_type),
        ('mistaken_witness', boolean_type),
        ('false_confession', boolean_type),
        ('perjury', boolean_type),
        ('false_evidence', boolean_type),
        ('official_misconduct', boolean_type),
        ('inadequate_defense', boolean_type),
    )

    with open('examples/realdata/exonerations-20150828.csv') as f:
        # Create a csv reader
        reader = csv.reader(f)

        # Skip header
        next(f)

        # Create the table
        data['exonerations'] = agate.Table(reader, columns)
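# Follow-on usage sketch (not from the original source). It assumes the CSV
# path above exists and that the agate version in use accepts the
# (name, type) column pairs that load_data() builds.
data = {}
load_data(data)

# Count exonerations flagged with a false confession; passing a single
# aggregation to Table.aggregate() returns a scalar.
num_false_confessions = data['exonerations'].aggregate(
    agate.Count('false_confession', True))
print(num_false_confessions)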
def _overall_stats(self):
    count_open_licenses = agate.Summary(
        'license_id',
        agate.Number(),
        lambda r: sum(license_id in utils.OPEN_LICENSES for license_id in r.values()))

    self.overall_package_stats = self._package_table().aggregate([
        ('open_data_count', count_open_licenses),
    ])

    self.resource_stats = self._package_resource_table().compute([
        ('open_format', agate.Formula(agate.Boolean(), open_formats_count)),
    ])

    if len(self._package_resource_table()) > 0:
        self.resource_stats = self.resource_stats.aggregate([
            ('open_format_count', agate.Count('open_format', True)),
            ('min_date', agate.Min('created')),
            ('max_date', agate.Max('created')),
        ])

    format_table = self._package_resource_table().group_by("format").aggregate([
        ('count', agate.Count()),
    ])
    count = format_table.aggregate([
        ('different_formats', agate.Count()),
    ])

    self.open_datasets = self.overall_package_stats.get("open_data_count", 0)
    self.open_format_count = self.resource_stats.get("open_format_count", 0)
    self.format_count = count.get("different_formats", 0)

    self.compute_dates()
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    metadata = MetaData(connection)
    sql_table = Table(table_name, metadata, autoload=True, autoload_with=connection)

    column_names = []
    column_types = []

    for sql_column in sql_table.columns:
        column_names.append(sql_column.name)

        if type(sql_column.type) in INTERVAL_MAP.values():
            py_type = datetime.timedelta
        else:
            py_type = sql_column.type.python_type

        if py_type in [int, float, decimal.Decimal]:
            if py_type is float:
                sql_column.type.asdecimal = True
            column_types.append(agate.Number())
        elif py_type is bool:
            column_types.append(agate.Boolean())
        elif issubclass(py_type, six.string_types):
            column_types.append(agate.Text())
        elif py_type is datetime.date:
            column_types.append(agate.Date())
        elif py_type is datetime.datetime:
            column_types.append(agate.DateTime())
        elif py_type is datetime.timedelta:
            column_types.append(agate.TimeDelta())
        else:
            raise ValueError('Unsupported sqlalchemy column type: %s' % type(sql_column.type))

    s = select([sql_table])
    rows = connection.execute(s)

    try:
        return agate.Table(rows, column_names, column_types)
    finally:
        if engine is not None:
            connection.close()
            engine.dispose()
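# Usage sketch (not part of the original source): importing agatesql applies
# the monkey patch so the function above is available as agate.Table.from_sql().
# The connection string and table name below are placeholders.
import agate
import agatesql  # noqa: F401  (import for its monkey-patching side effect)

table = agate.Table.from_sql('sqlite:///example.db', 'my_table')
table.print_table(max_rows=5)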
def get_org_format_aggregates(self, package_table):
    # Note: the "format" field may not exist for some packages.
    format_table = package_table.group_by("format").aggregate([
        ('count', agate.Count()),
    ])

    open_format_table = package_table.compute([
        ('open_format', agate.Formula(agate.Boolean(), open_formats_count)),
    ])
    open_format_table_aggregates = open_format_table.aggregate([
        ('open_formats', agate.Count('open_format', True)),
    ])

    new_table = format_table.compute([
        ('open_format', agate.Formula(agate.Boolean(), open_formats_count)),
    ])
    count = new_table.aggregate([
        ('different_formats', agate.Count()),
        ('open_formats', agate.Count('open_format', True)),
    ])

    count["open_formats_datasets"] = open_format_table_aggregates["open_formats"]

    return count
def test_distinct_values(self):
    column_names: List = [
        'id', 'name', 'dob', 'last seen', 'size', 'active',
    ]
    column_types: List = [
        agate.Number(), agate.Text(), agate.Date(),
        agate.DateTime(), agate.Text(), agate.Boolean(),
    ]
    rows = [
        (1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L', True),
        (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S', False),
        (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00', 'M', True),
        (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00', 'S', True),
        (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L', True),
        (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00', 'M', False),
        (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00', 'M', False),
        (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00', 'XL', True),
        (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00', 'L', False),
        (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M', True),
    ]

    model = csvhound.core.BaseHound()
    table = model.get_table_from_file('sample-data/test-distinct.csv')
    distinct = model.distinct_values('size')

    agate_table = agate.Table(rows, column_names, column_types)
    distinct_agate = agate_table.select('size').distinct('size')

    # now do the testing
    self.assertColumnNames(distinct, ('size',))
    self.assertColumnTypes(distinct, [type(c) for c in distinct.column_types])
    self.assertRows(distinct, distinct_agate)
def test_grouping(self):
    exonerations = agate.Table.from_csv(
        '../../../data/exonerations-20150828.csv')

    clean_state_data = exonerations.compute([
        ('federal', agate.Formula(
            agate.Boolean(),
            lambda row: row['state'].startswith('F-'))),
        ('state', agate.Formula(
            agate.Text(),
            lambda row: row['state'][2:] if row['state'].startswith('F-') else row['state'])),
    ], replace=True)

    by_state = clean_state_data.group_by('state')
    state_totals = by_state.aggregate([('count', agate.Count())])
    sorted_totals = state_totals.order_by('count', reverse=True)

    sorted_totals.print_table(max_rows=10)
def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        text_type = agate.Text(cast_nulls=False)
    else:
        text_type = agate.Text()

    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])
    else:
        return agate.TypeTester(types=[
            agate.Boolean(),
            agate.Number(locale=self.args.locale),
            agate.TimeDelta(),
            agate.Date(date_format=self.args.date_format),
            agate.DateTime(datetime_format=self.args.datetime_format),
            text_type,
        ])
def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        type_kwargs = {'null_values': ()}
    else:
        type_kwargs = {}

    types = [agate.Text(**type_kwargs)]

    if not self.args.no_inference:
        types = [
            agate.Boolean(**type_kwargs),
            agate.Number(locale=self.args.locale, **type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
        ] + types

    return agate.TypeTester(types=types)
def get_column_types(self):
    if getattr(self.args, 'blanks', None):
        type_kwargs = {'null_values': ()}
    else:
        type_kwargs = {}

    types = [agate.Text(**type_kwargs)]

    if not self.args.no_inference:
        types = [
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
            # This is a different order than agate's default, in order to
            # parse dates like "20010101".
            agate.Number(locale=self.args.locale, **type_kwargs),
        ] + types

    return agate.TypeTester(types=types)
def setUp(self):
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
        (None, 'b', None, None, None, None),
    )
    self.column_names = [
        'number', 'text', 'boolean', 'date', 'datetime', 'timedelta'
    ]
    self.column_types = [
        agate.Number(), agate.Text(), agate.Boolean(),
        agate.Date(), agate.DateTime(), agate.TimeDelta()
    ]

    self.table = agate.Table(self.rows, self.column_names, self.column_types)
def setUp(self):
    self.rows = (
        ('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM', '4h45m'),
        ('2', 'b', 'False', '11/01/2015', '11/01/2015 12:45 PM', '3h25m'),
        ('', '', '', '', '', ''),
    )

    self.number_type = agate.Number()
    self.text_type = agate.Text()
    self.boolean_type = agate.Boolean()
    self.date_type = agate.Date()
    self.datetime_type = agate.DateTime()
    self.timedelta_type = agate.TimeDelta()

    self.column_names = (
        'number', 'text', 'boolean', 'date', 'datetime', 'timedelta'
    )
    self.column_types = (
        self.number_type, self.text_type, self.boolean_type,
        self.date_type, self.datetime_type, self.timedelta_type
    )

    self.table = agate.Table(self.rows, zip(self.column_names, self.column_types))
title_rows = list(zip(sheet.row_values(4), sheet.row_values(5)))
print(title_rows)

titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles]
print(titles)

country_rows = [sheet.row_values(r) for r in range(6, 114)]
print(country_rows)

from xlrd.sheet import ctype_text
import agate

text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
date_type = agate.Date()

example_row = sheet.row(6)
print(example_row)
print(example_row[0].ctype)
print(example_row[0].value)
print(ctype_text)

types = []

for v in example_row:
    value_type = ctype_text[v.ctype]
    if value_type == 'text':
        types.append(text_type)
    elif value_type == 'number':
        print(impressions)
        return None, None


INSERT_QUERY = "INSERT INTO creative_stats ({}) VALUES ({}) ON CONFLICT (ad_id) DO UPDATE SET {}".format(
    ', '.join([k for k in KEYS]),
    ', '.join([":" + k for k in KEYS]),
    ', '.join([f"{k} = :{k}" for k in KEYS]))


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


# specifying column types saves 50% of time in loading the CSV! (30min w/o, 15min w/)
CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Boolean(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
    'First_Served_Timestamp': agate.DateTime(),
    'Last_Served_Timestamp': agate.DateTime(),
    'Age_Targeting': agate.Text(),
    'Gender_Targeting': agate.Text(),
    'Geo_Targeting_Included': agate.Text(),
    'Geo_Targeting_Excluded': agate.Text(),
    'Spend_Range_Min_USD': agate.Number(),
    'Spend_Range_Max_USD': agate.Number(),
    'Spend_Range_Min_EUR': agate.Number(),
    'Spend_Range_Max_EUR': agate.Number(),
    'Spend_Range_Min_INR': agate.Number(),
    'Spend_Range_Max_INR': agate.Number(),
    'Spend_Range_Min_BGN': agate.Number(),
    'Spend_Range_Max_BGN': agate.Number(),
    'Spend_Range_Min_HRK': agate.Number(),
    'Spend_Range_Max_HRK': agate.Number(),
    'Spend_Range_Min_CZK': agate.Number(),
    'Spend_Range_Max_CZK': agate.Number(),
    'Spend_Range_Min_DKK': agate.Number(),
    'Spend_Range_Max_DKK': agate.Number(),
    'Spend_Range_Min_HUF': agate.Number(),
    'Spend_Range_Max_HUF': agate.Number(),
    'Spend_Range_Min_PLN': agate.Number(),
    'Spend_Range_Max_PLN': agate.Number(),
    'Spend_Range_Min_RON': agate.Number(),
    'Spend_Range_Max_RON': agate.Number(),
    'Spend_Range_Min_SEK': agate.Number(),
    'Spend_Range_Max_SEK': agate.Number(),
    'Spend_Range_Min_GBP': agate.Number(),
    'Spend_Range_Max_GBP': agate.Number(),
    'Spend_Range_Min_NZD': agate.Number(),
    'Spend_Range_Max_NZD': agate.Number(),
}

OLD_CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Text(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
}

CREATIVE_STATS_SCHEMA_CHANGE_DATE = date(2020, 7, 1)  # it's sometime around here, I don't know for sure, that the schema changes
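# Usage sketch (not part of the original source): one way a mapping like
# CREATIVE_STATS_COLUMN_TYPES could be applied is via TypeTester's `force`
# argument, which pins the listed columns to the given types while anything
# else is inferred. The file name below is a placeholder.
tester = agate.TypeTester(force=CREATIVE_STATS_COLUMN_TYPES)
table = agate.Table.from_csv('creative_stats.csv', column_types=tester)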
# Name: Shani Kumar
# Date: 02/23/2020
# Course: DSC-540 - Data Preparation
# Desc: Practice joining numerous datasets, an activity you will likely run into frequently.
#       Following the example on pages 229-233 of Data Wrangling with Python, work through
#       the example to bring two datasets together.
# Usage: This program completes the assignment 11.2 requirements
#

# Import required packages
import xlrd
import agate
from xlrd.sheet import ctype_text

text_type = agate.Text()        # define text type
number_type = agate.Number()    # define number type
boolean_type = agate.Boolean()  # define boolean type
date_type = agate.Date()        # define date type


def remove_bad_chars(val):
    """
    Remove bad characters from the data. If the value is '-', return None.

    :param val: input string data
    :return: input string or None
    """
    if val == '-':
        return None
    return val


def get_types(example_row):
    """
#!/usr/bin/env python

"""
This module contains the XLS extension to :class:`Table <agate.table.Table>`.
"""

import datetime
from collections import OrderedDict

import agate
import six
import xlrd

EXCEL_TO_AGATE_TYPE = {
    xlrd.biffh.XL_CELL_EMPTY: agate.Boolean(),
    xlrd.biffh.XL_CELL_TEXT: agate.Text(),
    xlrd.biffh.XL_CELL_NUMBER: agate.Number(),
    xlrd.biffh.XL_CELL_DATE: agate.DateTime(),
    xlrd.biffh.XL_CELL_BOOLEAN: agate.Boolean(),
    xlrd.biffh.XL_CELL_ERROR: agate.Text(),
    xlrd.biffh.XL_CELL_BLANK: agate.Boolean(),
}


def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None,
             **kwargs):
    """
    'New Zealand', 'Norway', 'Poland', 'Portugal', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'Switzerland', 'Turkey', 'UK', 'USA'
]

NULL_VALUES = ['', '-999']

boolean = agate.Boolean(null_values=NULL_VALUES)
number = agate.Number(null_values=NULL_VALUES)
text = agate.Text(null_values=NULL_VALUES)


def load_data(data):
    tester = agate.TypeTester(types=[
        boolean,
        number,
        text
    ])

    data['dpi'] = agate.Table.from_csv('DPI2015_basefile.v5.csv', column_types=tester)


def add_value(data):
def get_computed_data_type(self, table):
    return agate.Boolean()
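# Context sketch (not part of the original source): get_computed_data_type()
# is usually one half of an agate.Computation subclass. The IsAdult class, the
# 'age' column, and the sample rows below are invented for illustration, and
# run(table) returning one value per row assumes a recent agate release.
import agate


class IsAdult(agate.Computation):
    def get_computed_data_type(self, table):
        # The computed column holds booleans.
        return agate.Boolean()

    def run(self, table):
        # Return one computed value per row.
        return [row['age'] is not None and row['age'] >= 18 for row in table.rows]


people = agate.Table([(17,), (42,), (None,)], ['age'], [agate.Number()])
adults = people.compute([('is_adult', IsAdult())])
adults.print_table()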