def ky_by_candidate(): generated_js.write('candidate_contributions = [') current_cand_ky_contrib = ky_candidate_contributions.where( lambda r: r['cmte_id'] in current_candidate_cmte_ids) current_candidate_groups = current_cand_ky_contrib.group_by('cand_nm') current_candidate_totals = current_candidate_groups.aggregate([ ('contributions_count', agate.Count()), ('contributions_sum', agate.Sum('contb_receipt_amt')) ]) sorted_current_candidate_totals = current_candidate_totals.order_by( 'contributions_sum', reverse=True) for row in sorted_current_candidate_totals.rows: generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) + ', sum: ' + str(row[2]) + ', status: "current"},') dropped_cand_ky_contrib = ky_candidate_contributions.where( lambda r: r['cmte_id'] not in current_candidate_cmte_ids) dropped_candidate_groups = dropped_cand_ky_contrib.group_by('cand_nm') dropped_candidate_totals = dropped_candidate_groups.aggregate([ ('contributions_count', agate.Count()), ('contributions_sum', agate.Sum('contb_receipt_amt')) ]) sorted_dropped_candidate_totals = dropped_candidate_totals.order_by( 'contributions_sum', reverse=True) filtered_dropped_candidate_totals = sorted_dropped_candidate_totals.where( lambda r: r['contributions_sum'] > 25000) for row in filtered_dropped_candidate_totals.rows: generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) + ', sum: ' + str(row[2]) + ', status: "dropped"},') generated_js.write(']\n')
def _overall_stats(self): count_open_licenses = agate.Summary( 'license_id', agate.Number(), lambda r: sum(license_id in utils.OPEN_LICENSES for license_id in r.values())) self.overall_package_stats = self._package_table().aggregate([ ('open_data_count', count_open_licenses), ]) self.resource_stats = self._package_resource_table().compute([ ('open_format', agate.Formula(agate.Boolean(), open_formats_count)), ]) if len(self._package_resource_table()) > 0: self.resource_stats = self.resource_stats.aggregate([ ('open_format_count', agate.Count('open_format', True)), ('min_date', agate.Min('created')), ('max_date', agate.Max('created')) ]) format_table = self._package_resource_table().group_by( "format").aggregate([ ('count', agate.Count()), ]) count = format_table.aggregate([ ('different_formats', agate.Count()), ]) self.open_datasets = self.overall_package_stats.get( "open_data_count", 0) self.open_format_count = self.resource_stats.get( "open_format_count", 0) self.format_count = count.get("different_formats", 0) self.compute_dates()
def race_and_age(data): # Filters rows without age data only_with_age = data['with_years_in_prison'].where( lambda r: r['age'] is not None ) # Group by race race_groups = only_with_age.group_by('race') # Sub-group by age cohorts (20s, 30s, etc.) race_and_age_groups = race_groups.group_by( lambda r: '%i0s' % (r['age'] // 10), key_name='age_group' ) # Aggregate medians for each group medians = race_and_age_groups.aggregate([ ('count', agate.Count()), ('median_years_in_prison', agate.Median('years_in_prison')) ]) # Sort the results sorted_groups = medians.order_by('median_years_in_prison', reverse=True) # Print out the results sorted_groups.print_table(max_rows=10)
def year_sum_counts(data): data['groupped_year'] = data['table'].group_by('year').aggregate([ ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')), ('accidents', agate.Count()), ('accidents_injured', count_accidents_injured) ]) return data
def test_charting(self): exonerations = agate.Table.from_csv( '../../../data/exonerations-20150828.csv') by_year_exonerated = exonerations.group_by('exonerated') counts = by_year_exonerated.aggregate([('count', agate.Count())]) chart = counts.order_by('exonerated').line_chart('exonerated', 'count', path="line_chart.svg") print(chart)
def states(data): by_state = data['with_years_in_prison'].group_by('state') state_totals = by_state.aggregate([ ('count', agate.Count()) ]) sorted_totals = state_totals.order_by('count', reverse=True) sorted_totals.print_table(max_rows=5) medians = by_state.aggregate([ ('count', agate.Count()), ('median_years_in_prison', agate.Median('years_in_prison')) ]) sorted_medians = medians.order_by('median_years_in_prison', reverse=True) sorted_medians.print_table(max_rows=5)
def distinct_values(self, key=None, with_count=False) -> agate.Table: logger.debug('distinct values for: ' + key) if with_count: table = self._table.group_by(key) # group_by returns a TableSet, so another step is required table = table.aggregate([('count', agate.Count())]) else: table = self._table.select(key).distinct(key) return table
def statistics(data): data['statistics'] = data['table'].aggregate([ ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')), ('accidents', agate.Count()), ('mean_accidents', agate.Mean('accidents')), ('mean_killed', agate.Mean('killed')), ('mean_injured', agate.Mean('injured')) ]) return data
def year_police_beat_sum_counts(data): data['year_police_beat'] = data['table'].group_by('year').group_by( 'police_beat').aggregate([ ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')), ('accidents', agate.Count()) ]).compute([('weighted_rank', RankWeightedAccidents('year')), ('killed_rank', GroupRanking('killed', 'year')), ('accidents_rank', GroupRanking('accidents', 'year')), ('injured_rank', GroupRanking('injured', 'year'))]) return data
def _analyze_count(self, event): # Average emails per day total = self.table.aggregate([("total", agate.Count())])["total"] total_senders = ( self.table.distinct("fields/from").select("fields/from").aggregate( [("total", agate.Count())])["total"]) if total == 0: first_email_date = "" last_email_date = None else: date_data = self.table.where( lambda row: row["fields/date"] is not None).compute([( "reduce_to_datetime", agate.Formula( agate.DateTime(datetime_format="%Y-%m-%d %H:%M:%S"), lambda row: helpers.reduce_to_datetime(row[ "fields/date"]), ), )]) first_email_date = (date_data.order_by("reduce_to_datetime").limit( 1).columns["fields/date"].values()[0]) last_email_date = (date_data.order_by( "reduce_to_datetime", reverse=True).limit(1).columns["fields/date"].values()[0]) event.set() metrics = [ ["Total emails", total], ["Senders", total_senders], ["First Email Date", first_email_date], ] if last_email_date: date_delta = helpers.convert_date( last_email_date) - helpers.convert_date(first_email_date) avg_email_per_day = total / date_delta.days metrics.append(["Avg. Emails/Day", f"{avg_email_per_day:.2f}"]) print(f"\n\n{helpers.h1_icn} Stats\n") print(termtables.to_string(metrics))
def get_org_format_aggregates(self, package_table): # format can not exist!?! format_table = package_table.group_by("format").aggregate([ ('count', agate.Count()), ]) open_format_table = package_table.compute([ ('open_format', agate.Formula(agate.Boolean(), open_formats_count)) ]) open_format_table_aggregates = open_format_table.aggregate([ ('open_formats', agate.Count('open_format', True)), ]) new_table = format_table.compute([ ('open_format', agate.Formula(agate.Boolean(), open_formats_count)) ]) count = new_table.aggregate([ ('different_formats', agate.Count()), ('open_formats', agate.Count('open_format', True)), ]) count["open_formats_datasets"] = open_format_table_aggregates[ "open_formats"] return count
def get_org_groups_aggregate(self): self.groups = len(set(self.package_groups)) group_table = self._group_table() if (len(group_table) > 0): group_aggregates = group_table.group_by('a').aggregate([ ('count', agate.Count()) ]) if len(group_aggregates) > 0: self.groups_dataset_variance = 1 if len(group_aggregates) > 1: self.groups_dataset_variance = group_aggregates.aggregate( agate.Variance('count')) self.group_score = self.score_for_groups( len(group_aggregates))
def print_ky_current_candidate_sum(): current_cand_ky_contrib = ky_candidate_contributions.where( lambda r: r['cmte_id'] in current_candidate_cmte_ids) ky_current_candidate_count = current_cand_ky_contrib.aggregate( agate.Count()) ky_current_candidate_sum = current_cand_ky_contrib.aggregate( agate.Sum('contb_receipt_amt')) current_candidate_count = len(current_candidate_cmte_ids) print('There are currently ' + str(current_candidate_count) + ' candidates running for president. Those ' + str(current_candidate_count) + ' candidates have received ' + str(ky_current_candidate_count) + ' donations totaling $' + str(ky_current_candidate_sum))
def print_ky_overall_summary(): # How much money has been donated by Kentuckians to the 2016 presidential race? ky_contrib_sum = ky_all_contributions.aggregate( agate.Sum('TRANSACTION_AMT')) # How many contributions have Kentuckians made to the presidential race? ky_contrib_count = ky_all_contributions.aggregate(agate.Count()) print( str(ky_contrib_count) + ' donations, totaling $' + str(ky_contrib_sum) + ' have been donated by Kentuckians to the 2016 presidential race.') generated_js.write('total_donated_sum = ' + str(ky_contrib_sum) + '\ntotal_donated_count = ' + str(ky_contrib_count) + '\n')
def test_grouping(self): exonerations = agate.Table.from_csv( '../../../data/exonerations-20150828.csv') clean_state_data = exonerations.compute( [('federal', agate.Formula(agate.Boolean(), lambda row: row['state'].startswith('F-'))), ('state', agate.Formula( agate.Text(), lambda row: row['state'][2:] if row['state'].startswith('F-') else row['state']))], replace=True) by_state = clean_state_data.group_by('state') state_totals = by_state.aggregate([('count', agate.Count())]) sorted_totals = state_totals.order_by('count', reverse=True) sorted_totals.print_table(max_rows=10)
def top_ky_donors_pac(): contributor_groups = ky_all_contributions.group_by('NAME') contributor_totals = contributor_groups.aggregate([ ('contributions_count', agate.Count()), ('contributions_sum', agate.Sum('TRANSACTION_AMT')) ]) sorted_contributor_totals = contributor_totals.order_by( 'contributions_sum', reverse=True) sorted_contributor_totals.print_table() generated_js.write('top_donors_to_pacs = [') for row in itertools.islice(sorted_contributor_totals.rows, 0, 5): generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) + ', sum: ' + str(row[2]) + '},') generated_js.write(']\n')
def top_ky_donors_candidates(): contributor_groups = ky_candidate_contributions.group_by('contbr_nm') contributor_totals = contributor_groups.aggregate([ ('contributions_count', agate.Count()), ('contributions_sum', agate.Sum('contb_receipt_amt')) ]) sorted_contributor_totals = contributor_totals.order_by( 'contributions_sum', reverse=True) sorted_contributor_totals.print_table() generated_js.write('top_donors_to_candidates = [') for row in itertools.islice(sorted_contributor_totals.rows, 0, 5): generated_js.write('{name: "' + row[0] + '", count: ' + str(row[1]) + ', sum: ' + str(row[2]) + '},') generated_js.write(']\n')
def print_ky_candidate_summary(): # How much money has been donated by Kentuckians to the presidential candidates? ky_candidate_sum = ky_candidate_contributions.aggregate( agate.Sum('contb_receipt_amt')) # How many contributions have Kentuckians made to presidential candidates? ky_candidate_count = ky_candidate_contributions.aggregate(agate.Count()) print( str(ky_candidate_count) + ' donations, totaling $' + str(ky_candidate_sum) + ' have been donated by Kentuckians specifically to the 2016 presidential candidates.' ) generated_js.write('total_candidate_donated_sum = ' + str(ky_candidate_sum) + '\ntotal_candidate_donated_count = ' + str(ky_candidate_count) + '\n')
def sum_counts_by_hour(data): data['hour'] = data['table'].group_by('hour').aggregate([ ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')), ('accidents', agate.Count()), ('accidents_injured', count_accidents_injured) ]).compute([ ('killed_percent', agate.Percent('killed')), ('injured_percent', agate.Percent('injured')), ('accidents_percent', agate.Percent('accidents')), ]).compute([ ('weighted', agate.Formula(agate.Number(), lambda r: r['killed_percent'] + r['injured_percent'])), ('accidents_within_half_deviation', StandardDeviations('accidents', 0.5)), ('killed_within_half_deviation', StandardDeviations('killed', 0.5)), ('injured_within_half_deviation', StandardDeviations('injured', 0.5)) ]) return data
def organisations(event, context): table = get_all_orgs() table.to_csv('/tmp/open_data_germany.csv') upload_file_to_s3('open_data_cities.csv','/tmp/open_data_germany.csv') aggregates = table.aggregate([ ('count', agate.Count()), ('sum', agate.Sum('datasets')) ]) with open('/tmp/summary.json', 'w') as f: json.dump(aggregates, f) upload_file_to_s3('open_data_cities_summary.json','/tmp/summary.json') body = { "message": "Go Serverless v1.0! Your function executed successfully!", "input": event } response = { "statusCode": 200, "body": json.dumps(body) } return response
def get_org_groups_aggregate(self, package_data): package_groups = [ group for p in package_data for group in self.get_group_title_from_package(p) ] group_table = agate.Table.from_object(package_groups) result = {} if (len(group_table) > 0): group_aggregates = group_table.group_by('a').aggregate([ ('count', agate.Count()) ]) if len(group_aggregates) > 0: result = {"groups": 1, "groups_dataset_variance": 1} if len(group_aggregates) > 1: result = { "groups": len(set(package_groups)), "groups_dataset_variance": group_aggregates.aggregate(agate.Variance('count')), "group_score": self.score_for_groups(len(group_aggregates)) } return result
def get_open_stats(self, package_data): table = agate.Table.from_object( self.get_open_formats_and_license(package_data)) return table.aggregate([('open_data_count', agate.Count())])
def year_police_beat_sum_counts(data): data['year_police_beat'] = data['table'].group_by('year').group_by( 'police_beat').aggregate([('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')), ('accidents', agate.Count())])
def sum_counts_by_full_hour(data): data['full_hour'] = data['table'].group_by('date_hour').aggregate([ ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')), ('accidents', agate.Count()) ])
# tables - now separate by gender youthf = youth.where(lambda row: row['gender'] == 'F') youthm = youth.where(lambda row: row['gender'] == 'M') adultf = adult.where(lambda row: row['gender'] == 'F') adultm = adult.where(lambda row: row['gender'] == 'M') elderlyf = elderly.where(lambda row: row['gender'] == 'F') elderlym = elderly.where(lambda row: row['gender'] == 'M') # I could also use a pivot, but I'm being explicit, for the sake of learning # and ensuring I don't screw up # http://agate.readthedocs.org/en/1.3.0/cookbook/transform.html # aggregate the count for all cmhayouthf = youthf.aggregate(agate.Count('housing_other', 'CMHA')) cmhayouthm = youthm.aggregate(agate.Count('housing_other', 'CMHA')) cmhaadultf = adultf.aggregate(agate.Count('housing_other', 'CMHA')) cmhaadultm = adultm.aggregate(agate.Count('housing_other', 'CMHA')) cmhaelderlyf = elderlyf.aggregate(agate.Count('housing_other', 'CMHA')) cmhaelderlym = elderlym.aggregate(agate.Count('housing_other', 'CMHA')) print("For CMHA clients") print("Youth F clients this month:", (cmhayouthf)) print("Youth M clients this month:", (cmhayouthm)) print("adult F clients this month:", (cmhaadultf)) print("Youth M clients this month:", (cmhayouthm)) print("Elderly F clients this month", (cmhaelderlyf)) print("Elderly M clients this month:", (cmhaelderlym))
#!/usr/bin/env python import agate table = agate.Table.from_csv( 'examples/realdata/Datagov_FY10_EDU_recp_by_State.csv') table.limit(10).bar_chart('State Name', 'TOTAL', 'docs/images/bar_chart.svg') table.limit(10).column_chart('State Name', 'TOTAL', 'docs/images/column_chart.svg') table = agate.Table.from_csv('examples/realdata/exonerations-20150828.csv') by_year_exonerated = table.group_by('exonerated') counts = by_year_exonerated.aggregate([('count', agate.Count())]) counts.order_by('exonerated').line_chart('exonerated', 'count', 'docs/images/line_chart.svg') table.scatterplot('exonerated', 'age', 'docs/images/dots_chart.svg') top_crimes = table.group_by('crime').having([('count', agate.Count())], lambda t: t['count'] > 100) by_year = top_crimes.group_by('exonerated') counts = by_year.aggregate([('count', agate.Count())]) by_crime = counts.group_by('crime') by_crime.order_by('exonerated').line_chart('exonerated', 'count', 'docs/images/lattice.svg')
def confessions(data): num_false_confessions = data['exonerations'].aggregate(agate.Count('false_confession', True)) print('False confessions: %i' % num_false_confessions)
# -*- coding: utf-8 -*- """ Generates basic tables from okc_relevant_data_genbin.csv. """ import agate # create agate table from gender-binary csv data okc_table = agate.Table.from_csv('okc_relevant_data_genbin.csv') # group by gender, returns TableSet by_gender = okc_table.group_by('gender', 'Gender') # count aggregate by_gender, returns table by_gender_count = by_gender.aggregate([('Count', agate.Count())]) # compute by_gender percentages by_gender_count_percent = by_gender_count.compute([('%', agate.Percent('Count'))]) by_gender_count_percent.print_table() """ | Gender | Count | % | | ------ | ------ | ------- | | Woman | 9,235 | 31.972… | | Man | 19,650 | 68.028… | """ # group by orientation, returns TableSet by_orientation = okc_table.group_by('orientation', 'Orientation') # count aggregate by_orientation, returns table by_orientation_count = by_orientation.aggregate([('Count', agate.Count())]) # compute by_orientation percentages by_orientation_count_percent = by_orientation_count.compute([ ('%', agate.Percent('Count'))
ic = incorrect.where(lambda r: r['review_decision'] == 'IC') inc = incorrect.where(lambda r: r['review_decision'] == 'INC') cc = data.where(lambda r: r['review_decision'] == 'CC') cnc = data.where(lambda r: r['review_decision'] == 'CNC') # In[2]: # game data and play data game_include = [ 'game_id', 'away', 'home', 'date', 'ref_1', 'ref_2', 'ref_3', 'score_away', 'score_home', 'original_pdf', 'box_score_url' ] by_game = incorrect.group_by('game_id') by_game_with_count = by_game.aggregate([('incorrect', agate.Count())]) game_data = data.select(game_include).distinct() # merge game_data_merged = game_data.join(by_game_with_count, 'game_id', 'game_id') game_data_merged.to_csv('output/web_games.csv') # In[3]: play_include = [ 'game_id', 'period', 'time', 'seconds_left', 'call_type', 'committing_player', 'disadvantaged_player', 'review_decision', 'comment', 'video', 'committing_team', 'disadvantaged_team', 'play_id', 'ref_made_call' ]
def test_aggregate(self): exonerations = agate.Table.from_csv( '../../../data/exonerations-20150828.csv') print( exonerations.aggregate(agate.Count('false_confession', value=True)))