def get_column_types(self):
    """
    Build an ``agate.TypeTester`` reflecting the parsed CLI arguments.

    Honors ``--blanks`` (keep empty strings instead of coercing to null),
    ``--no-inference`` (everything is text), locale, and any explicit
    date/datetime formats.
    """
    type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text_type = agate.Text(**type_kwargs)

    # With inference disabled, every column is parsed as text.
    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    number_type = agate.Number(locale=self.args.locale, **type_kwargs)

    # Same ordering as the `agate.TypeTester` class, minus Number (inserted below).
    types = [
        agate.Boolean(**type_kwargs),
        agate.TimeDelta(**type_kwargs),
        agate.Date(date_format=self.args.date_format, **type_kwargs),
        agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
        text_type,
    ]

    if self.args.date_format or self.args.datetime_format:
        # Try Number only after the date types, so values like "20010101"
        # can match an explicit date format instead of parsing as a number.
        types.insert(-1, number_type)
    else:
        types.insert(1, number_type)

    return agate.TypeTester(types=types)
def setUp(self):
    """Build a small agate fixture table (duplicate row, all-null row) and an in-memory SQLite DSN."""
    names = [
        'number',
        'text',
        'boolean',
        'date',
        'datetime',
    ]
    types = [
        agate.Number(),
        agate.Text(),
        agate.Boolean(),
        agate.Date(),
        agate.DateTime(),
    ]
    self.rows = (
        (1.123, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'\U0001f44d', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (2, u'c', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )
    self.column_names = names
    self.column_types = types
    self.table = agate.Table(self.rows, names, types)
    self.connection_string = 'sqlite:///:memory:'
def setUp(self):
    """Build a small agate fixture table plus a user-provided column-name list."""
    names = [
        'number',
        'text',
        'boolean',
        'date',
        'datetime',
    ]
    types = [
        agate.Number(),
        agate.Text(),
        agate.Boolean(),
        agate.Date(),
        agate.DateTime(),
    ]
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM'),
        (2, u'\U0001f44d', False, '11/5/2015', '11/4/2015 12:45 PM'),
        (None, 'b', None, None, None),
    )
    self.column_names = names
    # A separate, equal list standing in for names supplied by the user.
    self.user_provided_column_names = list(names)
    self.column_types = types
    self.table = agate.Table(self.rows, names, types)
def from_sql(cls, connection_or_string, table_name):
    """
    Create a new :class:`agate.Table` from a given SQL table. Types will be
    inferred from the database schema.

    Monkey patched as class method :meth:`Table.from_sql`.

    :param connection_or_string:
        An existing sqlalchemy connection or connection string.
    :param table_name:
        The name of a table in the referenced database.
    :raises ValueError:
        If a column's sqlalchemy type has no agate equivalent.
    """
    # `engine` is non-None only when we created the connection ourselves
    # from a string; in that case we must also tear it down in `finally`.
    engine, connection = get_engine_and_connection(connection_or_string)

    metadata = MetaData(connection)
    # Reflect the table's schema from the live database.
    sql_table = Table(table_name, metadata, autoload=True, autoload_with=connection)

    column_names = []
    column_types = []

    # Map each reflected SQL column type onto the matching agate type.
    for sql_column in sql_table.columns:
        column_names.append(sql_column.name)

        # Interval columns have no usable `python_type`; treat them as timedeltas.
        if type(sql_column.type) in INTERVAL_MAP.values():
            py_type = datetime.timedelta
        else:
            py_type = sql_column.type.python_type

        if py_type in [int, float, decimal.Decimal]:
            if py_type is float:
                # Ask the driver for Decimal values so agate.Number gets
                # exact decimals rather than binary floats.
                sql_column.type.asdecimal = True
            column_types.append(agate.Number())
        elif py_type is bool:
            column_types.append(agate.Boolean())
        elif issubclass(py_type, six.string_types):
            column_types.append(agate.Text())
        elif py_type is datetime.date:
            column_types.append(agate.Date())
        elif py_type is datetime.datetime:
            column_types.append(agate.DateTime())
        elif py_type is datetime.timedelta:
            column_types.append(agate.TimeDelta())
        else:
            raise ValueError('Unsupported sqlalchemy column type: %s' % type(sql_column.type))

    s = select([sql_table])

    rows = connection.execute(s)

    try:
        return agate.Table(rows, column_names, column_types)
    finally:
        # Only dispose resources we created; a caller-supplied connection
        # (engine is None) is left open for the caller to manage.
        if engine is not None:
            connection.close()
            engine.dispose()
def test_distinct_values(self):
    """distinct_values() should return the unique values of the named column.

    Builds an equivalent agate table by hand and checks the hound's result
    against agate's own ``select(...).distinct(...)``.
    """
    column_names: List = [
        'id',
        'name',
        'dob',
        'last seen',
        'size',
        'active',
    ]
    column_types: List = [
        agate.Number(),
        agate.Text(),
        agate.Date(),
        agate.DateTime(),
        agate.Text(),
        agate.Boolean(),
    ]
    rows = [(1, 'Alvin Cotton', '03-01-1980', '06-30-2019 12:12:00', 'L', True),
            (2, 'Usmaan Rojas', '01-12-1978', '06-30-2019 12:12:00', 'S', False),
            (3, 'Kingston Odling', '04-09-1990', '06-30-2019 12:12:00', 'M', True),
            (3, 'Pooja Gillespie', '10-07-1985', '06-30-2019 12:12:00', 'S', True),
            (4, 'Hal Blake', '08-17-1989', '06-30-2019 12:12:00', 'L', True),
            (5, 'Shannen Blevins', '06-10-1981', '06-30-2019 12:12:00', 'M', False),
            (5, 'Courteney Weston', '04-23-1992', '06-30-2019 12:12:00', 'M', False),
            (6, 'Conner Calhoun', '05-16-1977', '06-30-2019 12:12:00', 'XL', True),
            (7, 'Susie Rasmussen', '02-08-1987', '06-30-2019 12:12:00', 'L', False),
            (8, 'Cassie Beltran', '12-15-1982', '06-30-2019 12:12:00', 'M', True)]

    model = csvhound.core.BaseHound()
    table = model.get_table_from_file('sample-data/test-distinct.csv')
    distinct = model.distinct_values('size')

    agate_table = agate.Table(rows, column_names, column_types)
    distinct_agate = agate_table.select('size').distinct('size')

    # now do the testing
    self.assertColumnNames(distinct, ('size', ))
    # Compare against the independently-built expected table; the previous
    # assertion compared `distinct` against its own column types, which was
    # a tautology and could never fail.
    self.assertColumnTypes(distinct, [type(c) for c in distinct_agate.column_types])
    self.assertRows(distinct, distinct_agate)
def get_column_types(self):
    """
    Return an ``agate.TypeTester`` configured from the parsed CLI arguments:
    ``--blanks`` keeps empty strings as text, ``--no-inference`` disables
    type guessing entirely.
    """
    if getattr(self.args, 'blanks', None):
        text_type = agate.Text(cast_nulls=False)
    else:
        text_type = agate.Text()

    if self.args.no_inference:
        candidate_types = [text_type]
    else:
        candidate_types = [
            agate.Boolean(),
            agate.Number(locale=self.args.locale),
            agate.TimeDelta(),
            agate.Date(date_format=self.args.date_format),
            agate.DateTime(datetime_format=self.args.datetime_format),
            text_type,
        ]

    return agate.TypeTester(types=candidate_types)
def get_column_types(self):
    """
    Build an ``agate.TypeTester`` from the parsed CLI arguments, honoring
    ``--blanks``, ``--no-inference``, locale, and date/datetime formats.
    """
    type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text_type = agate.Text(**type_kwargs)

    # Text-only when inference is disabled.
    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    return agate.TypeTester(types=[
        agate.Boolean(**type_kwargs),
        agate.Number(locale=self.args.locale, **type_kwargs),
        agate.TimeDelta(**type_kwargs),
        agate.Date(date_format=self.args.date_format, **type_kwargs),
        agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
        text_type,
    ])
def get_column_types(self):
    """
    Build an ``agate.TypeTester`` from the parsed CLI arguments, honoring
    ``--blanks``, ``--no-inference``, locale, and date/datetime formats.
    """
    type_kwargs = {'null_values': ()} if getattr(self.args, 'blanks', None) else {}

    text_type = agate.Text(**type_kwargs)

    # Text-only when inference is disabled.
    if self.args.no_inference:
        return agate.TypeTester(types=[text_type])

    return agate.TypeTester(types=[
        agate.Boolean(**type_kwargs),
        agate.TimeDelta(**type_kwargs),
        agate.Date(date_format=self.args.date_format, **type_kwargs),
        agate.DateTime(datetime_format=self.args.datetime_format, **type_kwargs),
        # This is a different order than agate's default, in order to parse dates like "20010101".
        agate.Number(locale=self.args.locale, **type_kwargs),
        text_type,
    ])
def setUp(self):
    """Build a six-column agate fixture table including a timedelta column and an all-null row."""
    names = [
        'number',
        'text',
        'boolean',
        'date',
        'datetime',
        'timedelta',
    ]
    types = [
        agate.Number(),
        agate.Text(),
        agate.Boolean(),
        agate.Date(),
        agate.DateTime(),
        agate.TimeDelta(),
    ]
    self.rows = (
        (1, 'a', True, '11/4/2015', '11/4/2015 12:22 PM', '4:15'),
        (2, u'\U0001f44d', False, '11/5/2015', '11/4/2015 12:45 PM', '6:18'),
        (None, 'b', None, None, None, None),
    )
    self.column_names = names
    self.column_types = types
    self.table = agate.Table(self.rows, names, types)
def _analyze_count(self, event):
    """
    Compute and print summary stats for the email table: total emails,
    distinct senders, first email date, and (when a date range exists)
    average emails per day.

    :param event: a threading.Event set once aggregation is done, so a
        spinner/progress indicator elsewhere can stop.
    """
    # Average emails per day
    total = self.table.aggregate([("total", agate.Count())])["total"]
    total_senders = (
        self.table.distinct("fields/from").select("fields/from").aggregate(
            [("total", agate.Count())])["total"])

    if total == 0:
        first_email_date = ""
        last_email_date = None
    else:
        # Normalize the raw date strings to comparable datetimes, skipping
        # rows with no date at all.
        date_data = self.table.where(
            lambda row: row["fields/date"] is not None).compute([(
                "reduce_to_datetime",
                agate.Formula(
                    agate.DateTime(datetime_format="%Y-%m-%d %H:%M:%S"),
                    lambda row: helpers.reduce_to_datetime(row[
                        "fields/date"]),
                ),
            )])
        first_email_date = (date_data.order_by("reduce_to_datetime").limit(
            1).columns["fields/date"].values()[0])
        last_email_date = (date_data.order_by(
            "reduce_to_datetime", reverse=True).limit(1).columns["fields/date"].values()[0])

    event.set()

    metrics = [
        ["Total emails", total],
        ["Senders", total_senders],
        ["First Email Date", first_email_date],
    ]

    if last_email_date:
        date_delta = helpers.convert_date(
            last_email_date) - helpers.convert_date(first_email_date)
        # Guard against ZeroDivisionError when the first and last emails fall
        # on the same calendar day (date_delta.days == 0): treat the span as
        # a single day instead of crashing.
        avg_email_per_day = total / max(date_delta.days, 1)
        metrics.append(["Avg. Emails/Day", f"{avg_email_per_day:.2f}"])

    print(f"\n\n{helpers.h1_icn} Stats\n")
    print(termtables.to_string(metrics))
def setUp(self):
    """Build an all-string fixture table (including a blank row) typed via (name, type) pairs."""
    self.number_type = agate.Number()
    self.text_type = agate.Text()
    self.boolean_type = agate.Boolean()
    self.date_type = agate.Date()
    self.datetime_type = agate.DateTime()
    self.timedelta_type = agate.TimeDelta()

    self.rows = (
        ('1', 'a', 'True', '10/01/2015', '10/01/2015 12:30 PM', '4h45m'),
        ('2', 'b', 'False', '11/01/2015', '11/01/2015 12:45 PM', '3h25m'),
        ('', '', '', '', '', ''),
    )
    self.column_names = ('number', 'text', 'boolean', 'date', 'datetime', 'timedelta')
    self.column_types = (
        self.number_type,
        self.text_type,
        self.boolean_type,
        self.date_type,
        self.datetime_type,
        self.timedelta_type,
    )

    # agate also accepts an iterable of (name, type) pairs in place of
    # separate name/type sequences.
    self.table = agate.Table(self.rows, zip(self.column_names, self.column_types))
# Upsert: insert a row, or overwrite every column when ad_id already exists.
INSERT_QUERY = "INSERT INTO creative_stats ({}) VALUES ({}) ON CONFLICT (ad_id) DO UPDATE SET {}".format(
    ', '.join([k for k in KEYS]),
    ', '.join([":" + k for k in KEYS]),
    ', '.join([f"{k} = :{k}" for k in KEYS]))


def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]


# specifying column types saves 50% of time in loading the CSV! (30min w/o, 15min w/)
CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Boolean(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    # Impressions / spend are bucketed strings (e.g. ranges), not numbers.
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
    'First_Served_Timestamp': agate.DateTime(),
    'Last_Served_Timestamp': agate.DateTime(),
    'Age_Targeting': agate.Text(),
    'Gender_Targeting': agate.Text(),
    'Geo_Targeting_Included': agate.Text(),
    'Geo_Targeting_Excluded': agate.Text(),
    # Per-currency spend-range bounds are true numeric columns.
    'Spend_Range_Min_USD': agate.Number(),
    'Spend_Range_Max_USD': agate.Number(),
    'Spend_Range_Min_EUR': agate.Number(),
    'Spend_Range_Max_EUR': agate.Number(),
    'Spend_Range_Min_INR': agate.Number(),
    'Spend_Range_Max_INR': agate.Number(),
    'Spend_Range_Min_BGN': agate.Number(),
    'Spend_Range_Max_BGN': agate.Number(),
    'Spend_Range_Min_HRK': agate.Number(),
    'Spend_Range_Max_HRK': agate.Number(),
    'Spend_Range_Min_CZK': agate.Number(),
    'Spend_Range_Max_CZK': agate.Number(),
    'Spend_Range_Min_DKK': agate.Number(),
    'Spend_Range_Max_DKK': agate.Number(),
    'Spend_Range_Min_HUF': agate.Number(),
    'Spend_Range_Max_HUF': agate.Number(),
    'Spend_Range_Min_PLN': agate.Number(),
    'Spend_Range_Max_PLN': agate.Number(),
    'Spend_Range_Min_RON': agate.Number(),
    'Spend_Range_Max_RON': agate.Number(),
    'Spend_Range_Min_SEK': agate.Number(),
    'Spend_Range_Max_SEK': agate.Number(),
    'Spend_Range_Min_GBP': agate.Number(),
    'Spend_Range_Max_GBP': agate.Number(),
    'Spend_Range_Min_NZD': agate.Number(),
    'Spend_Range_Max_NZD': agate.Number()}

# Pre-July-2020 export schema: fewer columns, and Ad_Campaigns_List was text.
OLD_CREATIVE_STATS_COLUMN_TYPES = {
    'Ad_ID': agate.Text(),
    'Ad_URL': agate.Text(),
    'Ad_Type': agate.Text(),
    'Regions': agate.Text(),
    'Advertiser_ID': agate.Text(),
    'Advertiser_Name': agate.Text(),
    'Ad_Campaigns_List': agate.Text(),
    'Date_Range_Start': agate.Date(),
    'Date_Range_End': agate.Date(),
    'Num_of_Days': agate.Number(),
    'Impressions': agate.Text(),
    'Spend_USD': agate.Text(),
}

CREATIVE_STATS_SCHEMA_CHANGE_DATE = date(2020, 7, 1)  # it's sometime around here, I don't know for sure, that the schema changes


def load_creative_stats_to_db(csvfn, report_date):
    # should log: duration, total rows; New today; here yesterday, gone today.
# Map of mso-number-format strings (as found in Excel-exported HTML) to the
# agate type that should parse the corresponding column.
MSO_NUMBER_FORMAT_TO_AGATE_TYPE = {
    # Plain and fixed-precision numerics, scientific, percent, fractions.
    r'0': agate.Number(),
    r'0\.0': agate.Number(),
    r'0\.00': agate.Number(),
    r'0\.000': agate.Number(),
    r'0\.0000': agate.Number(),
    r'0\.E+00': agate.Number(),
    r'0%': agate.Number(),
    r'Percent': agate.Number(),
    r'\#\ ?\/?': agate.Number(),
    r'\#\ ??\/??': agate.Number(),
    r'\#\ ???\/???': agate.Number(),
    # Named date formats. NOTE(review): 'Long Date' maps to an empty
    # date_format string — confirm whether that falls back to default parsing.
    r'Short Date': agate.Date(date_format='%d/%m/%Y'),
    r'Medium Date': agate.Date(date_format='%d-%b-%y'),
    r'Long Date': agate.Date(date_format=''),
    r'Short Time': agate.DateTime(datetime_format='%H:%M'),
    r'Medium Time': agate.DateTime(datetime_format='%I:%M %p'),
    r'Long Time': agate.DateTime(datetime_format='%H:%M:%S:%f'),
    r'\@': agate.Text(),
    # TODO add mm\/dd\/yy and so on...
}


def from_html(cls, path, table_identifier=0, header=True, encoding='utf-8',
              mso_number_formats_override=None, row_limit=None, **kwargs):
"""
This module contains the XLS extension to :class:`Table <agate.table.Table>`.
"""

import datetime
from collections import OrderedDict

import agate
import six
import xlrd

# Map of xlrd cell-type codes to agate column types.
# NOTE(review): empty/blank/error cells map to Boolean/Text placeholders —
# presumably so every xlrd code has *some* agate type; confirm intent.
EXCEL_TO_AGATE_TYPE = {
    xlrd.biffh.XL_CELL_EMPTY: agate.Boolean(),
    xlrd.biffh.XL_CELL_TEXT: agate.Text(),
    xlrd.biffh.XL_CELL_NUMBER: agate.Number(),
    xlrd.biffh.XL_CELL_DATE: agate.DateTime(),
    xlrd.biffh.XL_CELL_BOOLEAN: agate.Boolean(),
    xlrd.biffh.XL_CELL_ERROR: agate.Text(),
    xlrd.biffh.XL_CELL_BLANK: agate.Boolean(),
}


def from_xls(cls, path, sheet=None, skip_lines=0, header=True, encoding_override=None, **kwargs):
    """
    Parse an XLS file.
import agate, os, itertools, time, datetime, glob, csv
from datetime import date

text_type = agate.Text()
datetime_type = agate.DateTime()
# Force the receipt-date column to stay text; it is parsed manually later.
tester = agate.TypeTester(force={'contb_receipt_dt': agate.Text()})

# Datestamp like "2016315" (no zero padding on month/day).
today = date.today()
datestamp = str(today.year) + str(today.month) + str(today.day)

# Load the first matching KY candidate-contributions CSV.
# NOTE(review): glob()[0] raises IndexError if no file matches — confirm
# the pipeline guarantees the file exists before this module is imported.
ky_candidates_file = str(
    glob.glob('data/csv/process/*ky-candidate-contributions.csv')[0])
ky_candidate_contributions = agate.Table.from_csv(ky_candidates_file, column_types=tester)

# FEC committee IDs of the candidates currently charted.
current_candidate_cmte_ids = ['C00580100', 'C00575795']
#Trump, Donald J. = C00580100
#Sanders, Bernard = C00577130
#Kasich, John R. = C00581876
#Clinton, Hillary Rodham = C00575795
#Cruz, Rafael Edward 'Ted' = C00574624


def candidate_brackets(contributions):
    #brackets
    #bracket1 = 200 and under
    #bracket2 = 200.01 - 499.99
    #bracket3 = 500 - 999.99
def candidate_time_charts():
    """
    Regenerate app/data/candidate_charts.js with per-candidate monthly
    contribution count and sum series (Chartist-style label/series lines).

    Reads the module-level `ky_candidate_contributions` table; writes the
    JS file as a side effect and returns None.
    """
    # NOTE(review): os.remove raises FileNotFoundError if the JS file does
    # not exist yet — confirm a prior run always creates it.
    os.remove('app/data/candidate_charts.js')
    text_type = agate.Text()
    datetime_type = agate.DateTime()
    chart_js = open('app/data/candidate_charts.js', 'a')

    # Derive a "MMM-YY" bucket and a real datetime from the receipt date.
    candidate_contribs_with_monthyear = ky_candidate_contributions.compute([
        ('month_year', agate.Formula(text_type, lambda r: r['contb_receipt_dt'][-6:])),
        ('date', agate.Formula(
            text_type, lambda r: datetime.datetime.strptime(
                r['contb_receipt_dt'], '%d-%b-%y')))
    ])
    date_sorted_candidat_contribs = candidate_contribs_with_monthyear.order_by(
        'date')
    # Drop contributions on or before 2015-02-28.
    restricted_date_candidate_contribs = date_sorted_candidat_contribs.where(
        lambda r: r['date'] > '2015-02-28 00:00:00')
    by_candidate_contribs = candidate_contribs_with_monthyear.group_by(
        'cand_nm')

    # We need a list of unique candidates and a list of unique month_years
    # Then we need to say, for each month_year and each candidate, how many contributions
    # happened.
    # We only need to write one label variable for all candidates:
    # labels = ['FEB-15', 'MAR-15', etc...]
    # For each candidate, we need:
    # candidateName_series = [200, 34, 885, 123, etc...]

    # Get unique list of month_years.
    # These are our labels.
    # We'll have to figure out how to sort these
    month_years = []
    for row in restricted_date_candidate_contribs.rows:
        month_year = row['month_year']
        if month_year in month_years:
            pass
        else:
            month_years.append(str(month_year))

    # Get unique list of candidates
    candidates = []
    for row in candidate_contribs_with_monthyear.rows:
        candidate = row['cand_nm']
        if candidate in candidates:
            pass
        else:
            candidates.append(candidate)

    # Sub-group each candidate's contributions by month bucket, then count
    # and sum each (candidate, month) group.
    candidate_month_year_groups = by_candidate_contribs.group_by(
        lambda r: r['month_year'], key_name='month_year_group')
    month_year_counts = candidate_month_year_groups.aggregate([
        ('contribution_count', agate.Count()),
        ('contribution_sum', agate.Sum('contb_receipt_amt'))
    ])
    #month_year_counts.print_table(max_rows=200)

    chart_js.write('count_labels = ' + str(month_years) + '\n')

    # For each candidate, each month, we want one value for count and one value for sum
    # If these values cannot be found in the month_year_counts table, then we should record a 0
    for candidate in candidates:
        count_value_list = []
        sum_value_list = []
        for month in month_years:
            contrib_count = 0
            contrib_sum = 0
            for row in month_year_counts.rows:
                if row['cand_nm'] == candidate:
                    # Chart series name: candidate surname, lowercased.
                    series_label = candidate.split(',')[0].lower()
                    if month == row['month_year_group']:
                        contrib_count = str(row['contribution_count'])
                        #contrib_count = '{:,f}'.format(row['contribution_count'])
                        contrib_count_dict = {}
                        contrib_count_dict['meta'] = str('Contributions to ' + candidate + ' for ' + month)
                        contrib_count_dict['value'] = contrib_count
                        count_value_list.append(dict(contrib_count_dict))
                        contrib_sum = str(row['contribution_sum'])
                        #contrib_sum = '${:,.2f}'.format(row['contribution_sum'])
                        contrib_sum_dict = {}
                        contrib_sum_dict['meta'] = str('Amt. contributed to ' + candidate + ' for ' + month)
                        contrib_sum_dict['value'] = contrib_sum
                        sum_value_list.append(dict(contrib_sum_dict))
                    else:
                        pass
            # No matching (candidate, month) row: record explicit zeros so
            # every series has one point per label.
            if contrib_count == 0:
                contrib_count_dict = {}
                contrib_count_dict['meta'] = str('Contributions to ' + candidate + ' for ' + month)
                contrib_count_dict['value'] = '0'
                count_value_list.append(dict(contrib_count_dict))
            if contrib_sum == 0:
                contrib_sum_dict = {}
                contrib_sum_dict['meta'] = str('Amount contributed to ' + candidate + ' for ' + month)
                contrib_sum_dict['value'] = '0'
                sum_value_list.append(dict(contrib_sum_dict))
        chart_js.write(series_label + '_count_series = ' + str(count_value_list) + '\n')
        chart_js.write(series_label + '_sum_series = ' + str(sum_value_list) + '\n')
    chart_js.close()