def setUp(self):
    """Build a small fixture table with one column per basic agate type."""
    self.rows = (
        (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )
    self.column_names = ['number', 'text', 'boolean', 'date', 'datetime']
    self.column_types = [
        agate.Number(),
        agate.Text(),
        agate.Boolean(),
        agate.Date(),
        agate.DateTime(),
    ]
    self.table = agate.Table(self.rows, self.column_names, self.column_types)
    # In-memory SQLite keeps the round-trip tests self-contained.
    self.connection_string = 'sqlite:///:memory:'
def get_column_types(self):
    """Build an :class:`agate.TypeTester` configured from the CLI arguments."""
    type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text_type = agate.Text(**type_kwargs)

    # With inference disabled, everything is treated as text.
    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    number_type = agate.Number(locale=self.args.locale, **type_kwargs)

    # Same ordering as the `agate.TypeTester` defaults (minus Number).
    candidate_types = [
        agate.Boolean(**type_kwargs),
        agate.TimeDelta(**type_kwargs),
        agate.Date(date_format=self.args.date_format, **type_kwargs),
        agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
        text_type,
    ]

    # When an explicit date/datetime format is given, try Number only after
    # Date/DateTime so all-digit strings like "20010101" can parse as dates.
    position = -1 if (self.args.date_format or self.args.datetime_format) else 1
    candidate_types.insert(position, number_type)

    return agate.TypeTester(types=candidate_types)
def setUp(self):
    """Create the shared fixture table plus a user-supplied column-name list."""
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )
    self.column_names = ['number', 'text', 'boolean', 'date', 'datetime']
    # Kept as a distinct list so tests can pass it independently of the
    # auto-detected names.
    self.user_provided_column_names = ['number', 'text', 'boolean', 'date', 'datetime']
    self.column_types = [
        agate.Number(),
        agate.Text(),
        agate.Boolean(),
        agate.Date(),
        agate.DateTime(),
    ]
    self.table = agate.Table(self.rows, self.column_names, self.column_types)
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    """
    engine, connection = get_engine_and_connection(connection_or_string)

    # Reflect the table definition from the live database schema.
    metadata = MetaData(connection)
    sql_table = Table(table_name, metadata, autoload=True, autoload_with=connection)

    column_names = []
    column_types = []

    for sql_column in sql_table.columns:
        column_names.append(sql_column.name)

        # Interval columns are mapped straight to timedelta rather than
        # relying on the dialect's python_type.
        if type(sql_column.type) in INTERVAL_MAP.values():
            py_type = datetime.timedelta
        else:
            py_type = sql_column.type.python_type

        if py_type in [int, float, decimal.Decimal]:
            if py_type is float:
                # Ask sqlalchemy for Decimal values instead of floats so
                # agate Numbers stay exact.
                sql_column.type.asdecimal = True
            column_types.append(agate.Number())
        elif py_type is bool:
            column_types.append(agate.Boolean())
        elif issubclass(py_type, six.string_types):
            column_types.append(agate.Text())
        elif py_type is datetime.date:
            column_types.append(agate.Date())
        elif py_type is datetime.datetime:
            column_types.append(agate.DateTime())
        elif py_type is datetime.timedelta:
            column_types.append(agate.TimeDelta())
        else:
            raise ValueError('Unsupported sqlalchemy column type: %s' % type(sql_column.type))

    s = select([sql_table])

    rows = connection.execute(s)

    try:
        return agate.Table(rows, column_names, column_types)
    finally:
        # Clean up only when we own the resources — engine is presumably None
        # when the caller passed in an existing connection (TODO confirm
        # against get_engine_and_connection).
        if engine is not None:
            connection.close()
            engine.dispose()
def get_types(example_row):
    """Map each xlrd cell in *example_row* to a fresh agate column type."""
    # Any ctype label not listed here falls back to Text.
    type_for = {
        'text': agate.Text,
        'number': agate.Number,
        'xldate': agate.Date,
    }

    types = []
    for cell in example_row:
        label = xlrd.sheet.ctype_text[cell.ctype]
        types.append(type_for.get(label, agate.Text)())
    return types
def test_distinct_values(self):
    """distinct_values() should agree with agate's own select/distinct result."""
    column_names: List = ['id', 'name', 'dob', 'last seen', 'size', 'active']
    column_types: List = [
        agate.Number(),
        agate.Text(),
        agate.Date(),
        agate.DateTime(),
        agate.Text(),
        agate.Boolean(),
    ]
    # Duplicate ids/sizes are deliberate: distinct() must collapse them.
    rows = [(1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L', True),
            (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S', False),
            (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00', 'M', True),
            (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00', 'S', True),
            (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L', True),
            (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00', 'M', False),
            (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00', 'M', False),
            (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00', 'XL', True),
            (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00', 'L', False),
            (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M', True)]

    model = csvhound.core.BaseHound()
    table = model.get_table_from_file('sample-data/test-distinct.csv')
    distinct = model.distinct_values('size')

    # Build the expected result with agate directly.
    agate_table = agate.Table(rows, column_names, column_types)
    distinct_agate = agate_table.select('size').distinct('size')

    # now do the testing
    self.assertColumnNames(distinct, ('size', ))
    # Fixed: the original compared distinct's types against itself, which
    # could never fail. Compare against the independently built agate table.
    self.assertColumnTypes(distinct, [type(c) for c in distinct_agate.column_types])
    self.assertRows(distinct, distinct_agate)
def get_column_types(self):
    """Return a TypeTester honoring --blanks, --no-inference, locale and formats."""
    if getattr(self.args, 'blanks', None):
        # Keep blank strings as-is rather than casting them to null.
        text_type = agate.Text(cast_nulls=False)
    else:
        text_type = agate.Text()

    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    return agate.TypeTester(types=[
        agate.Boolean(),
        agate.Number(locale=self.args.locale),
        agate.TimeDelta(),
        agate.Date(date_format=self.args.date_format),
        agate.DateTime(datetime_format=self.args.datetime_format),
        text_type,
    ])
def get_column_types(self):
    """Assemble the candidate agate types used to sniff CSV columns."""
    type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text_type = agate.Text(**type_kwargs)

    if self.args.no_inference:
        candidates = [text_type]
    else:
        # Text stays last so it remains the catch-all.
        candidates = [
            agate.Boolean(**type_kwargs),
            agate.Number(locale=self.args.locale, **type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
            text_type,
        ]

    return agate.TypeTester(types=candidates)
def setUp(self):
    """Fixture: one table covering all six agate column types."""
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
        (2, u'👍', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
        (None, 'b', None, None, None, None),
    )
    self.column_names = ['number', 'text', 'boolean', 'date', 'datetime', 'timedelta']
    self.column_types = [
        agate.Number(),
        agate.Text(),
        agate.Boolean(),
        agate.Date(),
        agate.DateTime(),
        agate.TimeDelta(),
    ]
    self.table = agate.Table(self.rows, self.column_names, self.column_types)
def get_column_types(self):
    """Build a TypeTester whose Number check runs after the date checks."""
    type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    candidates = [agate.Text(**type_kwargs)]

    if not self.args.no_inference:
        inferred = [
            agate.Boolean(**type_kwargs),
            agate.TimeDelta(**type_kwargs),
            agate.Date(date_format=self.args.date_format, **type_kwargs),
            agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
            # This is a different order than agate's default, in order to parse dates like "20010101".
            agate.Number(locale=self.args.locale, **type_kwargs),
        ]
        candidates = inferred + candidates

    return agate.TypeTester(types=candidates)
def setUp(self):
    """Fixture of all-string cells plus one instance of each agate type."""
    self.rows = (
        ('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM', '4h45m'),
        ('2', 'b', 'False', '11/01/2015', '11/01/2015 12:45 PM', '3h25m'),
        ('', '', '', '', '', ''),
    )

    self.number_type = agate.Number()
    self.text_type = agate.Text()
    self.boolean_type = agate.Boolean()
    self.date_type = agate.Date()
    self.datetime_type = agate.DateTime()
    self.timedelta_type = agate.TimeDelta()

    self.column_names = ('number', 'text', 'boolean', 'date', 'datetime', 'timedelta')
    self.column_types = (self.number_type, self.text_type, self.boolean_type,
                         self.date_type, self.datetime_type, self.timedelta_type)

    # NOTE(review): passes (name, type) pairs as a single sequence —
    # presumably an older agate Table API; verify against the pinned version.
    self.table = agate.Table(self.rows, zip(self.column_names, self.column_types))
# Notebook-style exploration: inspect spreadsheet rows and build agate types.
print(title_rows)

# Join the two header fragments for each column, then trim whitespace.
titles = [t[0] + ' ' + t[1] for t in title_rows]
titles = [t.strip() for t in titles]

# Country data lives in rows 6-113 of the sheet.
country_rows = [sheet.row_values(r) for r in range(6, 114)]

from xlrd.sheet import ctype_text
import agate

text_type = agate.Text()
number_type = agate.Number()
boolean_type = agate.Boolean()
date_type = agate.Date()

example_row = sheet.row(6)
# Fixed: the original used Python 2 `print x` statements here, which are a
# syntax error under Python 3 (the same script already used print() above).
print(example_row)
print(example_row[0].ctype)
print(example_row[0].value)
print(ctype_text)

# Map each cell's xlrd ctype label to an agate type instance.
types = []
for v in example_row:
    value_type = ctype_text[v.ctype]
    if value_type == 'text':
        types.append(text_type)
    elif value_type == 'number':
        types.append(number_type)
def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None,
             row_limit=None, column_names=None, column_types=None, **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    :param row_limit:
        Limit how many rows of data will be read
    :param column_names:
        See :meth:`.Table.__init__`.
    :param column_types:
        See :meth:`.Table.__init__`.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    def open_workbook(f):
        # Try a normal XLS parse first; fall back to unwrapping an OLE
        # container when xlrd rejects the compound document.
        try:
            book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override, on_demand=True)
        except xlrd.compdoc.CompDocError:
            # This is not a pure XLS file; we'll try to read it though.
            # Let's try the Compound File Binary Format:
            ole = olefile.OleFileIO(f)
            if ole.exists('Workbook'):
                d = ole.openstream('Workbook')
                book = xlrd.open_workbook(file_contents=d.read(), on_demand=True)
            else:
                raise IOError('No Workbook stream found in OLE file')
        return book

    if hasattr(path, 'read'):
        book = open_workbook(path)
    else:
        with open(path, 'rb') as f:
            book = open_workbook(f)

    try:
        # Accept a single sheet spec or a sequence of them.
        multiple = agate.utils.issequence(sheet)
        if multiple:
            sheets = sheet
        else:
            sheets = [sheet]

        tables = OrderedDict()

        for i, sheet in enumerate(sheets):
            if isinstance(sheet, six.string_types):
                sheet = book.sheet_by_name(sheet)
            elif isinstance(sheet, int):
                sheet = book.sheet_by_index(sheet)
            else:
                # No sheet specified: default to the first one.
                sheet = book.sheet_by_index(0)

            if header:
                offset = 1
                column_names_detected = []
            else:
                offset = 0
                column_names_detected = None

            columns = []
            column_types_detected = []

            # Read the sheet column by column, inferring an agate type for each.
            for i in range(sheet.ncols):
                if row_limit is None:
                    values = sheet.col_values(i, skip_lines + offset)
                    types = sheet.col_types(i, skip_lines + offset)
                else:
                    values = sheet.col_values(i, skip_lines + offset, skip_lines + offset + row_limit)
                    types = sheet.col_types(i, skip_lines + offset, skip_lines + offset + row_limit)
                excel_type = determine_excel_type(types)
                agate_type = determine_agate_type(excel_type)

                if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                    values = normalize_booleans(values)
                elif excel_type == xlrd.biffh.XL_CELL_DATE:
                    # Excel date cells may hold a date, a time, or both;
                    # narrow the agate type when one half is missing.
                    values, with_date, with_time = normalize_dates(
                        values, book.datemode)
                    if not with_date:
                        agate_type = agate.TimeDelta()
                    if not with_time:
                        agate_type = agate.Date()

                if header:
                    name = six.text_type(sheet.cell_value(skip_lines, i)) or None
                    column_names_detected.append(name)

                columns.append(values)
                column_types_detected.append(agate_type)

            # Transpose column-major data into rows.
            rows = []

            if columns:
                for i in range(len(columns[0])):
                    rows.append([c[i] for c in columns])

            if column_names is None:
                sheet_column_names = column_names_detected
            else:
                sheet_column_names = column_names

            sheet_column_types = column_types
            # A dict of overrides is merged on top of the detected types.
            if isinstance(column_types, dict) and sheet_column_names is not None:
                sheet_column_types = dict(
                    zip(sheet_column_names, column_types_detected))
                sheet_column_types.update(column_types)

            tables[sheet.name] = agate.Table(rows, sheet_column_names, sheet_column_types, **kwargs)
    finally:
        book.release_resources()

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
# NOTE(review): the two statements below are the tail of a function whose
# `def` line falls outside this excerpt; shown indented as function-body code.
    print(impressions)
    return None, None


# Upsert statement: insert a creative_stats row, or overwrite every column
# when a row with the same ad_id already exists.
INSERT_QUERY = "INSERT INTO creative_stats ({}) VALUES ({}) ON CONFLICT (ad_id) DO UPDATE SET {}".format(
    ', '.join([k for k in KEYS]),
    ', '.join([":" + k for k in KEYS]),
    ', '.join([f"{k} = :{k}" for k in KEYS]))


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


# specifying column types saves 50% of time in loading the CSV! (30min w/o, 15min w/)
CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Boolean(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
    'First_Served_Timestamp': agate.DateTime(),
    'Last_Served_Timestamp': agate.DateTime(),
    'Age_Targeting': agate.Text(),
    'Gender_Targeting': agate.Text(),
    'Geo_Targeting_Included': agate.Text(),
    'Geo_Targeting_Excluded': agate.Text(),
    'Spend_Range_Min_USD': agate.Number(),
    'Spend_Range_Max_USD': agate.Number(),
    'Spend_Range_Min_EUR': agate.Number(),
    'Spend_Range_Max_EUR': agate.Number(),
    'Spend_Range_Min_INR': agate.Number(),
    'Spend_Range_Max_INR': agate.Number(),
    'Spend_Range_Min_BGN': agate.Number(),
    'Spend_Range_Max_BGN': agate.Number(),
    'Spend_Range_Min_HRK': agate.Number(),
    'Spend_Range_Max_HRK': agate.Number(),
    'Spend_Range_Min_CZK': agate.Number(),
    'Spend_Range_Max_CZK': agate.Number(),
    'Spend_Range_Min_DKK': agate.Number(),
    'Spend_Range_Max_DKK': agate.Number(),
    'Spend_Range_Min_HUF': agate.Number(),
    'Spend_Range_Max_HUF': agate.Number(),
    'Spend_Range_Min_PLN': agate.Number(),
    'Spend_Range_Max_PLN': agate.Number(),
    'Spend_Range_Min_RON': agate.Number(),
    'Spend_Range_Max_RON': agate.Number(),
    'Spend_Range_Min_SEK': agate.Number(),
    'Spend_Range_Max_SEK': agate.Number(),
    'Spend_Range_Min_GBP': agate.Number(),
    'Spend_Range_Max_GBP': agate.Number(),
    'Spend_Range_Min_NZD': agate.Number(),
    'Spend_Range_Max_NZD': agate.Number()}

# Pre-July-2020 CSV schema: fewer columns, and Ad_Campaigns_List was text.
OLD_CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Text(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
}

CREATIVE_STATS_SCHEMA_CHANGE_DATE = date(2020, 7, 1)  # it's sometime around here, I don't know for sure, that the schema changes
# Date: 02/23/2020 # Course: DSC-540 - Data Preparation # Desc: Practice joining numerous datasets – an activity you will likely run into frequently. Following the example # in your text that starts on page 229 – 233 of Data Wrangling with Python, work through the example to bring # two datasets together. # Usage: This program is to complete assignment 11.2 requirements # # Import required packages import xlrd import agate from xlrd.sheet import ctype_text text_type = agate.Text() # define text type number_type = agate.Number() # define number type boolean_type = agate.Boolean() # define boolean type date_type = agate.Date() # define date type def remove_bad_chars(val): """ This method remove bad character from data. If it is '-' it returns none :param val: input string data :return: input string or none """ if val == '-': return None return val def get_types(example_row): """ This routine based on data in a row determines the column type
import re

import six

# Map mso-number-format values (as emitted by Excel-exported HTML) to agate
# column types. Keys are regex patterns.
MSO_NUMBER_FORMAT_TO_AGATE_TYPE = {
    r'0': agate.Number(),
    r'0\.0': agate.Number(),
    r'0\.00': agate.Number(),
    r'0\.000': agate.Number(),
    r'0\.0000': agate.Number(),
    r'0\.E+00': agate.Number(),
    r'0%': agate.Number(),
    r'Percent': agate.Number(),
    r'\#\ ?\/?': agate.Number(),
    r'\#\ ??\/??': agate.Number(),
    r'\#\ ???\/???': agate.Number(),
    r'Short Date': agate.Date(date_format='%d/%m/%Y'),
    r'Medium Date': agate.Date(date_format='%d-%b-%y'),
    r'Long Date': agate.Date(date_format=''),
    r'Short Time': agate.DateTime(datetime_format='%H:%M'),
    r'Medium Time': agate.DateTime(datetime_format='%I:%M %p'),
    r'Long Time': agate.DateTime(datetime_format='%H:%M:%S:%f'),
    r'\@': agate.Text(),
    # TODO add mm\/dd\/yy and so on...
}


# NOTE(review): signature truncated in this excerpt; parameters and body
# continue beyond the visible source.
def from_html(cls, path, table_identifier=0, header=True, encoding='utf-8',
import agate
import agatecharts

# Monkey-patch the charting methods onto agate tables.
agatecharts.patch()

OUTPUT_DIR = 'docs/samples'

# Start with an empty output directory for the rendered samples.
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

for filename in os.listdir(OUTPUT_DIR):
    os.remove(os.path.join(OUTPUT_DIR, filename))

# Force the ' Date' column (leading space is in the CSV header) to parse as a
# date; every other column is inferred.
tester = agate.TypeTester(force={
    ' Date': agate.Date('%Y-%m-%d')
})

emissions = agate.Table.from_csv('examples/epa-emissions-20150910.csv', tester)

# Derived columns: day-of-month plus the emission figures with nulls
# coerced to zero.
emissions = emissions.compute([
    (agate.Formula(agate.Number(), lambda r: r[' Date'].day), 'day'),
    (agate.Formula(agate.Number(), lambda r: r[' SO2 (tons)'] or 0), 'so2'),
    (agate.Formula(agate.Number(), lambda r: r[' NOx (tons)'] or 0), 'noX'),
    (agate.Formula(agate.Number(), lambda r: r[' CO2 (short tons)'] or 0), 'co2')
])

states = emissions.group_by('State')

# NOTE(review): statement truncated in this excerpt; the aggregate list
# continues beyond the visible source.
state_totals = states.aggregate([
    ('so2', agate.Sum(), 'so2'),
    ('co2', agate.Sum(), 'co2'),
def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, **kwargs):
    """
    Parse an XLS file.

    :param path:
        Path to an XLS file to load or a file-like object for one.
    :param sheet:
        The names or integer indices of the worksheets to load. If not specified
        then the first sheet will be used.
    :param skip_lines:
        The number of rows to skip from the top of the sheet.
    :param header:
        If :code:`True`, the first row is assumed to contain column names.
    """
    if not isinstance(skip_lines, int):
        raise ValueError('skip_lines argument must be an int')

    if hasattr(path, 'read'):
        book = xlrd.open_workbook(file_contents=path.read(), encoding_override=encoding_override)
    else:
        with open(path, 'rb') as f:
            book = xlrd.open_workbook(file_contents=f.read(), encoding_override=encoding_override)

    # Accept a single sheet spec or a sequence of them.
    multiple = agate.utils.issequence(sheet)
    if multiple:
        sheets = sheet
    else:
        sheets = [sheet]

    tables = OrderedDict()

    for i, sheet in enumerate(sheets):
        if isinstance(sheet, six.string_types):
            sheet = book.sheet_by_name(sheet)
        elif isinstance(sheet, int):
            sheet = book.sheet_by_index(sheet)
        else:
            # No sheet specified: default to the first one.
            sheet = book.sheet_by_index(0)

        if header:
            offset = 1
            column_names = []
        else:
            offset = 0
            column_names = None

        columns = []
        column_types = []

        # Read column-major data, inferring an agate type per column.
        for i in range(sheet.ncols):
            data = sheet.col_values(i)
            values = data[skip_lines + offset:]
            types = sheet.col_types(i)[skip_lines + offset:]

            excel_type = determine_excel_type(types)
            agate_type = determine_agate_type(excel_type)

            if excel_type == xlrd.biffh.XL_CELL_BOOLEAN:
                values = normalize_booleans(values)
            elif excel_type == xlrd.biffh.XL_CELL_DATE:
                # Excel date cells may hold a date, a time, or both; narrow
                # the agate type when one half is missing.
                values, with_date, with_time = normalize_dates(
                    values, book.datemode)
                if not with_date:
                    agate_type = agate.TimeDelta()
                if not with_time:
                    agate_type = agate.Date()

            if header:
                name = six.text_type(data[skip_lines]) or None
                column_names.append(name)

            columns.append(values)
            column_types.append(agate_type)

        # Transpose column-major data into rows.
        rows = []

        if columns:
            for i in range(len(columns[0])):
                rows.append([c[i] for c in columns])

        # Caller-supplied names/types override the detected ones; strip them
        # from kwargs before forwarding to agate.Table.
        if 'column_names' in kwargs:
            if not header:
                column_names = kwargs['column_names']
            del kwargs['column_names']

        if 'column_types' in kwargs:
            column_types = kwargs['column_types']
            del kwargs['column_types']

        tables[sheet.name] = agate.Table(rows, column_names, column_types, **kwargs)

    if multiple:
        return agate.MappedSequence(tables.values(), tables.keys())
    else:
        return tables.popitem()[1]
# https://gist.github.com/jo-tez/7f0a6bad2cd6731d8db16d9542719edb import agate import numpy.random as npr import isodate from faker import Factory random_groups = npr.choice(3, 100, p=[.25, .65, .10]) column_names = ['id', 'name', 'hire_date', 'pb', 'salary'] column_types = [ agate.Number(), agate.Text(), agate.Date(), agate.Number(), agate.Number() ] def generate_test_data(): # Set seed to generate consistent test data npr.seed(1) data_lists = [] n_recs = 110 fk = Factory.create() for i in range(n_recs): payband = npr.choice([1, 2, 3], p=[0.7, 0.25, 0.05]) payband = int(payband)