def subset(data): subset = data['table'].where( lambda r: r['origin'] in SELECTED_COUNTRIES and r['year'] >= 1980) groups = subset.group_by(lambda r: '/'.join([str(r['year']), r['origin']]), key_name='year_and_origin') refugees = groups.aggregate([ ('refugees', agate.Sum('refugees')), ('asylum_seekers', agate.Sum('asylum_seekers')), ('returned_refugees', agate.Sum('returned_refugees')), ('idps', agate.Sum('idps')), ('returned_idps', agate.Sum('returned_idps')), ('stateless_persons', agate.Sum('stateless_persons')), ('others', agate.Sum('others')), ('total', agate.Sum('total')) ]).order_by('year_and_origin', reverse=True) refugees = refugees.compute([ ('year', agate.Formula(agate.Text(), lambda r: r['year_and_origin'].split('/')[0])), ('origin', agate.Formula(agate.Text(), lambda r: r['year_and_origin'].split('/')[1])) ]) refugees = refugees.select([ 'origin', 'year', 'refugees', 'asylum_seekers', 'idps', 'returned_idps', 'stateless_persons', 'others', 'total' ]) refugees.to_csv('subset.csv') refugees.pivot( 'year', 'origin', agate.Sum('total')).order_by('year').to_csv('subset_pivot.csv')
def _analyze_date(self, event): table = self.table.where( lambda row: row["fields/date"] is not None).compute([ ( "reduce_to_date", agate.Formula( agate.Text(), lambda row: helpers.reduce_to_date(row["fields/date"]), ), ), ( "reduce_to_year", agate.Formula( agate.Number(), lambda row: helpers.reduce_to_year(row["fields/date"]), ), ), ( "reduce_to_time", agate.Formula( agate.Number(), lambda row: helpers.reduce_to_time(row["fields/date"]), ), ), ]) years = table.distinct( "reduce_to_year").columns["reduce_to_year"].values() _data = {} for year in years: _data[year] = (table.where(lambda row: row[ "reduce_to_year"] == year).select("reduce_to_date").pivot( "reduce_to_date").order_by("reduce_to_date")) event.set() print(f"\n\n{helpers.h1_icn} Date\n") for year in years: data_keys = list(_data[year].columns["reduce_to_date"].values()) _counts = list( map(int, list(_data[year].columns["Count"].values()))) _sum = sum(_counts) data_count = [[i] for i in _counts] args = { "color": False, "custom_tick": False, "start_dt": f"{year}-01-01" } print(f"\n{helpers.h2_icn} Year {year} ({_sum:,} emails)\n") calendar_heatmap(data=data_count, args=args, labels=data_keys)
def time_span(): candidate_contribs_with_monthyear = ky_candidate_contributions.compute([ ('month_year', agate.Formula(text_type, lambda r: r['contb_receipt_dt'][-6:])), ('date', agate.Formula( text_type, lambda r: datetime.datetime.strptime( r['contb_receipt_dt'], '%d-%b-%y'))) ]) candidate_contribs_with_monthyear.to_csv('data/csv/process/' + datestamp + 'ky-contribs-with-monthyear.csv')
def add_value(data): key = { 0: 'no data', 1: 'right', 2: 'center', 3: 'left', None: 'no executive' } data['value_added'] = data['dpi'].compute([ ('alignment', agate.Formula(text, lambda r: key[r['execrlc']])), ('any_nationalist', agate.Formula(boolean, lambda r: r['gov1nat'] or r['gov2nat'] or r['gov3nat'] or r['opp1nat'])), ('any_regionalist', agate.Formula(boolean, lambda r: bool(r['gov1reg'] or r['gov2reg'] or r['gov3reg'] or r['opp1reg']))) ])
def calc_table(in_csv, out_csv): table = agate.Table.from_csv(in_csv, column_names=column_names, column_types=column_types) table = table.pivot('HIEDUC', 'STAYTOG') table = table.compute([('Total', agate.Formula(agate.Number(), get_total))]) table = table.compute([('Percent agree', agate.Formula(agate.Number(), get_percent_agree))]) table.to_csv(out_csv) return table
def test_grouping(self): exonerations = agate.Table.from_csv( '../../../data/exonerations-20150828.csv') clean_state_data = exonerations.compute( [('federal', agate.Formula(agate.Boolean(), lambda row: row['state'].startswith('F-'))), ('state', agate.Formula( agate.Text(), lambda row: row['state'][2:] if row['state'].startswith('F-') else row['state']))], replace=True) by_state = clean_state_data.group_by('state') state_totals = by_state.aggregate([('count', agate.Count())]) sorted_totals = state_totals.order_by('count', reverse=True) sorted_totals.print_table(max_rows=10)
def add_full_hour_date(data): data['table'] = data['table'].compute([ ('date_hour', agate.Formula( agate.Text(), lambda r: r['date_time'].strftime("%Y-%m-%d %H:00:00"))), ])
def compute_ranks(table): table = table.compute([ ('dataset_rank', agate.Rank('datasets', reverse=True)), ('formats_rank', agate.Rank('format_count', reverse=True)), #('open_formats_rank', agate.Rank('open_formats', reverse=True)), ('last_update_rank', agate.Rank('days_since_last_update')), #('open_datasets_rank', agate.Rank('open_datasets', reverse=True)), ('category_rank', agate.Rank('category_count', reverse=True)), ('category_variance_rank', agate.Rank('category_variance')), #('update_start_rank', agate.Rank('days_between_start_and_last_update')), #('start_rank', agate.Rank('days_since_start', reverse=True)), #('openess_score', agate.Formula(number, openness_score)), ('dataset_score_rank', agate.Rank('dataset_score', reverse=True)), ('category_score_rank', agate.Rank('category_score', reverse=True)), ]) table = table.compute([ ('dataset_rank_std', StandadizeScore('dataset_rank')), ('formats_rank_std', StandadizeScore('formats_rank')), ('last_update_rank_std', StandadizeScore('last_update_rank')), ('category_rank_std', StandadizeScore('category_rank')), ('category_variance_rank_std', StandadizeScore('category_variance_rank')), ('dataset_score_rank_std', StandadizeScore('dataset_score_rank')), ('category_score_rank_std', StandadizeScore('category_score_rank')), ]) table = table.compute([ ('overall_rank_data', agate.Formula(agate.Number(), overall_rank)) ]) table = table.compute([ ('overall_rank', agate.Rank('overall_rank_data')), ]) return table
def _overall_stats(self): count_open_licenses = agate.Summary( 'license_id', agate.Number(), lambda r: sum(license_id in utils.OPEN_LICENSES for license_id in r.values())) self.overall_package_stats = self._package_table().aggregate([ ('open_data_count', count_open_licenses), ]) self.resource_stats = self._package_resource_table().compute([ ('open_format', agate.Formula(agate.Boolean(), open_formats_count)), ]) if len(self._package_resource_table()) > 0: self.resource_stats = self.resource_stats.aggregate([ ('open_format_count', agate.Count('open_format', True)), ('min_date', agate.Min('created')), ('max_date', agate.Max('created')) ]) format_table = self._package_resource_table().group_by( "format").aggregate([ ('count', agate.Count()), ]) count = format_table.aggregate([ ('different_formats', agate.Count()), ]) self.open_datasets = self.overall_package_stats.get( "open_data_count", 0) self.open_format_count = self.resource_stats.get( "open_format_count", 0) self.format_count = count.get("different_formats", 0) self.compute_dates()
def _add_random_column(data_tbl): # Reset seed to produce random numbers npr.seed() new_table = data_tbl.compute([('random_group', agate.Formula(agate.Number(), _generate_random))]) return new_table
def monthly_contrib_count(): candidate_contribs_with_monthyear = ky_candidate_contributions.compute([ ('month_year', agate.Formula(text_type, lambda r: r['contb_receipt_dt'][-6:])), ('date', agate.Formula( text_type, lambda r: datetime.datetime.strptime( r['contb_receipt_dt'], '%d-%b-%y'))) ]) july16_contributions = candidate_contribs_with_monthyear.where( lambda r: r['month_year'] == 'JUL-16') positive_contributions = july16_contributions.where( lambda r: r['contb_receipt_amt'] > 0) candidate_brackets(july16_contributions)
def get_org_format_aggregates(self, package_table): # format can not exist!?! format_table = package_table.group_by("format").aggregate([ ('count', agate.Count()), ]) open_format_table = package_table.compute([ ('open_format', agate.Formula(agate.Boolean(), open_formats_count)) ]) open_format_table_aggregates = open_format_table.aggregate([ ('open_formats', agate.Count('open_format', True)), ]) new_table = format_table.compute([ ('open_format', agate.Formula(agate.Boolean(), open_formats_count)) ]) count = new_table.aggregate([ ('different_formats', agate.Count()), ('open_formats', agate.Count('open_format', True)), ]) count["open_formats_datasets"] = open_format_table_aggregates[ "open_formats"] return count
def compute_ranks(self): self.orgs_table = self.orgs_table.compute([ ('dataset_rank', agate.Rank('datasets')), ('formats_rank', agate.Rank('format_count')), ('open_formats_rank', agate.Rank('datasets')), ('last_update_rank', agate.Rank('days_since_last_update')), ('open_datasets_rank', agate.Rank('open_datasets')), ]) self.orgs_table = self.orgs_table.compute([ ('overall_rank_data', agate.Formula(number, overall_rank)), ]) self.orgs_table = self.orgs_table.compute([ ('overall_rank', agate.Rank('overall_rank_data', reverse=True)), ])
def sum_counts_by_hour(data): data['hour'] = data['table'].group_by('hour').aggregate([ ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')), ('accidents', agate.Count()), ('accidents_injured', count_accidents_injured) ]).compute([ ('killed_percent', agate.Percent('killed')), ('injured_percent', agate.Percent('injured')), ('accidents_percent', agate.Percent('accidents')), ]).compute([ ('weighted', agate.Formula(agate.Number(), lambda r: r['killed_percent'] + r['injured_percent'])), ('accidents_within_half_deviation', StandardDeviations('accidents', 0.5)), ('killed_within_half_deviation', StandardDeviations('killed', 0.5)), ('injured_within_half_deviation', StandardDeviations('injured', 0.5)) ]) return data
def update_where(self, update_col, update_val, test_col, test_val): # check the types of the update and test columns. colnames = self.column_names types = [ self.column_types[colnames.index(update_col)], self.column_types[colnames.index(test_col)] ] # start the magic self = self.join(self.select([update_col, test_col]) \ .rename(column_names = {update_col: 'update_col', test_col: 'test_col'}) ) self = self.join( agate.Table([[update_val, test_val]] * len(self.rows), ['update_val', 'test_val'], types)) self = self.compute([('updated', agate.Formula(agate.Number(), update_where_function))]) self = self.rename(column_names = {update_col: 'old', 'updated': update_col}) \ .exclude(['old', 'update_col', 'test_col', 'update_val', 'test_val']) return self
def _analyze_count(self, event): # Average emails per day total = self.table.aggregate([("total", agate.Count())])["total"] total_senders = ( self.table.distinct("fields/from").select("fields/from").aggregate( [("total", agate.Count())])["total"]) if total == 0: first_email_date = "" last_email_date = None else: date_data = self.table.where( lambda row: row["fields/date"] is not None).compute([( "reduce_to_datetime", agate.Formula( agate.DateTime(datetime_format="%Y-%m-%d %H:%M:%S"), lambda row: helpers.reduce_to_datetime(row[ "fields/date"]), ), )]) first_email_date = (date_data.order_by("reduce_to_datetime").limit( 1).columns["fields/date"].values()[0]) last_email_date = (date_data.order_by( "reduce_to_datetime", reverse=True).limit(1).columns["fields/date"].values()[0]) event.set() metrics = [ ["Total emails", total], ["Senders", total_senders], ["First Email Date", first_email_date], ] if last_email_date: date_delta = helpers.convert_date( last_email_date) - helpers.convert_date(first_email_date) avg_email_per_day = total / date_delta.days metrics.append(["Avg. Emails/Day", f"{avg_email_per_day:.2f}"]) print(f"\n\n{helpers.h1_icn} Stats\n") print(termtables.to_string(metrics))
def to_and_from(data): refugees = data['table'].select( ['origin', 'residence', 'year', 'refugees']) by_year = refugees.group_by('year') by_origin = (by_year.group_by('origin').aggregate([ ('origin_refugees', agate.Sum('refugees')) ])) by_residence = (by_year.group_by('residence').aggregate([ ('residence_refugees', agate.Sum('refugees')) ])) def comparison(r): origin = r['origin_refugees'] residence = r['residence_refugees'] if not origin: return None if not residence: return None return 1 - (abs(origin - residence) / (origin + residence)) joined = (by_origin.join(by_residence, lambda r: (r['year'], r['origin']), lambda r: (r['year'], r['residence'])).exclude([ 'residence', 'year2' ]).rename(column_names={ 'origin': 'country' }).compute([ ('ratio', agate.Formula(agate.Number(), comparison)) ])) joined.to_csv('joined.csv')
def main(): table = agate.Table.from_csv('data.csv') print(list(zip(table.column_names, table.column_types))) with_decimals = table.compute([ ('year', agate.Formula(agate.Text(), lambda r: r['Date'].year)), ('lat', DecimalDegrees('Latitude')), ('lng', DecimalDegrees('Longitude')), ]) with_decimals.to_csv('with_decimals.csv') by_year = with_decimals.group_by('year') by_usable = by_year.group_by( lambda r: r['lat'] is not None and r['lng'] is not None) by_year.aggregate([('count', agate.Length())]).print_csv() by_usable.aggregate([('count', agate.Length())]).print_table() data = {} for year, table in by_year.items(): data[year] = [] for row in table.rows: if not row['lat'] or not row['lng']: continue data[year].append( geojson.Point([float(row['lng']), float(row['lat'])])) with open('src/data/attacks.json', 'w') as f: geojson.dump(data, f, sort_keys=True)
has_por.columns['Rural'].aggregate(agate.Mean()) has_por.columns['Rural'].aggregate(agate.Max()) has_por.find(lambda x: x['Rural'] > 50) ranked = table.compute([(agate.Rank('Total (%)', reverse=True), 'Total Child Labor Rank')]) # If we wanted a column showing children not working percentage ... def reverse_percent(row): return 100 - row['Total (%)'] table = table.compute([(agate.Formula(number_type, reverse_percent), 'Children not working (%)')]) # some investigation into other possible connections hiv_workbook = xlrd.open_workbook(DATA_FOLDER + 'hiv_aids_2014.xlsx') hiv_workbook.sheet_names() children = hiv_workbook.sheet_by_name('T6_CABA') hiv_titles = zip(children.row_values(3), children.row_values(4), children.row_values(5)) hiv_titles = [' '.join(list(t)) for t in hiv_titles] hiv_titles = [t.strip() for t in hiv_titles] for row in range(0, children.nrows): print(row, children.row(row))
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print(row['Total Child Labor Rank'], '\t', row['Countries and areas'], row['Total (%)']) #%%[markdown] # ### 6) Calculate the percentage of children not involved in child labor. #%% def reverse_percent(row): return 100 - row['Total (%)'] ranked = table.compute([ ('Children not working (%)', agate.Formula(number_type, reverse_percent)), ]) ranked = ranked.compute([ ('Total Child Labor Rank', agate.Rank('Children not working (%)')), ]) percent_not_working = ranked.aggregate(agate.Mean('Children not working (%)')) print( 'The total % of children not working is {}%'.format( round(percent_not_working, 2)), '\n') print('The best countries for children not working are:') for row in ranked.order_by('Total (%)', reverse=False).limit(20).rows: print(row['Countries and areas'], row['Total (%)'], row['Children not working (%)'])
def add_year_column(data): data['table'] = data['table'].compute([ ('year', agate.Formula(agate.Text(), lambda r: r['date_time'].strftime("%Y"))), ])
# 손실 자료만 뽑아서 확인해보면 매칭되지 않은 국가는 많지 않다. # earth.json 파일을 클리닝 하는 것을 권장한다. # no_continent = cpi_and_cl.where(lambda x: x['continent'] is None) # # for r in no_continent.rows: # print(r['Country / Territory']) # 클리닝된 대륙 데이터 with open(os.path.join(data_dir, 'chp9', 'earth-cleaned.json'), 'r') as f: country_json = json.load(f) country_dict = {} for dct in country_json: country_dict[dct['name']] = dct['parent'] cpi_and_cl = cpi_and_cl.compute([('continent', agate.Formula(agate.Text(), get_country))]) grp_by_cont = cpi_and_cl.group_by('continent') print(grp_by_cont) for cont, table in grp_by_cont.items(): print(cont, len(table.rows)) # 눈으로 확인했을 때 아프리카와 아시아가 높은 값을 가지는 것을 확인할 수 있다. # 하지만 이것만으로 데이터에 접근하기엔 쉽지 않다. # 이 때 필요한 것이 집계 메서드이다. # 국민들이 인식하는 정부 부패 및 아동 노동과 관련하여 대륙들이 어떻게 다른지 비교해보자. agg = grp_by_cont.aggregate([ ('cl_mean', agate.Mean('Total (%)')), ('cl_max', agate.Max('Total (%)')), ('cpi_median', agate.Median('CPI 2013 Score')),
first_match = has_por.find(lambda x: x['Rural'] > 50) print(first_match['Countries and areas']) ranked = table.compute([ ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)), ]) for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print row['Total (%)'], row['Total Child Labor Rank'] def reverse_percent(row): return 100 - row['Total (%)'] ranked = table.compute([ ('Children not working (%)', agate.Formula(number_type, reverse_percent)), ]) ranked = ranked.compute([ ('Total Child Labor Rank', agate.Rank('Children not working (%)')), ]) for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print row['Total (%)'], row['Total Child Labor Rank'] -------3 / 5 import agate import xlrd from xlrd.sheet import ctype_text
OUTPUT_DIR = 'docs/samples' if not os.path.exists(OUTPUT_DIR): os.mkdir(OUTPUT_DIR) for filename in os.listdir(OUTPUT_DIR): os.remove(os.path.join(OUTPUT_DIR, filename)) tester = agate.TypeTester(force={ ' Date': agate.Date('%Y-%m-%d') }) emissions = agate.Table.from_csv('examples/epa-emissions-20150910.csv', tester) emissions = emissions.compute([ (agate.Formula(agate.Number(), lambda r: r[' Date'].day), 'day'), (agate.Formula(agate.Number(), lambda r: r[' SO2 (tons)'] or 0), 'so2'), (agate.Formula(agate.Number(), lambda r: r[' NOx (tons)'] or 0), 'noX'), (agate.Formula(agate.Number(), lambda r: r[' CO2 (short tons)'] or 0), 'co2') ]) states = emissions.group_by('State') state_totals = states.aggregate([ ('so2', agate.Sum(), 'so2'), ('co2', agate.Sum(), 'co2'), ('noX', agate.Sum(), 'noX') ]) new_york = states['NY'] # NB: key_type shouldn't be necessary--agate bug #234
def percentages(column, precision=2): return (column, agate.Formula(agate.Text(), lambda r: ('%%.%df' % precision) % (r[column] * 100)))
import json import agate country_json = json.loads(open('../../data/chp9/earth.json', 'rb').read()) country_dict = {} for dct in country_json: country_dict[dct['name']] = dct['parent'] def get_country(country_row): return country_dict.get(country_row['Country / Territory'].lower()) cpi_and_cl = cpi_and_cl.compute([ ('continent', agate.Formula(text_type, get_country)), ]) cpi_and_cl.column_names for r in cpi_and_cl.rows: print r['Country / Territory'], r['continent'] no_continent = cpi_and_cl.where(lambda x: x['continent'] is None) for r in no_continent.rows: print r['Country / Territory'] cpi_and_cl = cpi_table.join(ranked, 'Country / Territory', 'Countries and areas', inner=True) country_json = json.loads(
print() # 아동 노동률이 높은 국가의 순위를 알아보자 # 이를 위해서는 Total(%) 열을 기반으로 데이터를 정렬하면 된다. ranked = table.compute([ ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)), ]) for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print(row['Total (%)'], row['Total Child Labor Rank']) print() # reverse를 사용하지 않고 오름차순 정렬을 하고 싶다면 역 백분율을 기준으로 열을 생성하면 된다. def reverse_percent(row): return 100 - row['Total (%)'] ranked = table.compute([('Children not working (%)', agate.Formula(agate.Number(), reverse_percent))]) ranked = ranked.compute([ ('Total Child Labor Rank', agate.Rank('Children not working (%)')), ]) for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print(row['Total (%)'], row['Total Child Labor Rank']) with open( os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ranked.pickle'), 'wb') as f: pickle.dump(ranked, f)
# Read in the courses table course_params = agate.Table.from_csv(file_courses) #~ #~ STEP 1 #~ #~ Create a shortlist of courses by using a popularity ranking based on preferences. #~ # count the preference positions ("preference matrix") for each course pref_count = prefs_n.pivot('course', 'preference') # calculate a course popularity index and rank pref_count = pref_count \ .compute([ ('pop', agate.Formula(agate.Number(), popularity_function)) ]) \ .compute([ ('rank', agate.Rank('pop', reverse=True)) ]) \ .order_by('rank') # reorder the preference columns pref_count_na = list( pref_count.exclude(['course', 'pop', 'rank']).column_names) pref_count_na.sort() pref_count_na = ['rank', 'pop', 'course'] + pref_count_na pref_count = pref_count.select(pref_count_na) #output pref_count.to_csv(outputdir + 'longlist.csv')
# basic stats by_game = data.group_by('game_id') game_count = by_game.aggregate([('count', agate.Count())]) num_games = len(game_count.rows) num_calls = len(data.rows) num_calls_incorrect = len(incorrect.rows) percent_incorrect = float(num_calls_incorrect) / float(num_calls) * 100.0 num_calls_incorrect_per_game = float(num_calls_incorrect) / float(num_games) # calls incorrect per minute played data_with_quarter = data.compute([ ('quarter', agate.Formula(agate.Number(), lambda r: int(r['period'].replace('Q', '')))), ('quarter_id', agate.Formula(agate.Text(), lambda r: r['game_id'] + '-' + r['period'])) ]) # find total minutes played unique_quarters = data_with_quarter.distinct('quarter_id').select(['quarter']) def getMinutes(row): if (row['quarter'] == 4): return 2 return 5 quarter_minutes = unique_quarters.compute([
# 计算Place of residence (%) Urban列的平均值 # table.aggregate(agate.Mean('Place of residence (%) Urban')) # has_por = table.where(lambda r: r['Place of residence (%)Urban'] is not None) # has_por.aggregate(agate.Mean('Place of residence (%)Urban')) # 得出每行数据中农村童工雇佣率大于50%的数据 # first_match = has_por.find(lambda x: x['Rural'] > 50) # print(first_match['Countries and areas']) # 使用Total列数据对童工雇佣率最高的数据进行排序,从高到低 # ranked = table.compute([("Total Child Labor Rank", agate.Rank('Total (%)', reverse=True))]) # for row in ranked.order_by('Total (%)', reverse=True,).limit(20).rows: # print(row['Total (%)'], row['Countries and areas']) # 普通儿童占比 ranked = table.compute([("Children not working (%)", agate.Formula(number_type, reverse_percent))]) ranked = ranked.compute([("Total Child Labor Rank", agate.Rank("Children not working (%)"))]) # for row in ranked.order_by('Total (%)', reverse=True,).limit(20).rows: # print(row['Total (%)'], row['Countries and areas']) # 国际公开腐败感指数处理 cpi_workbook = xlrd.open_workbook('data/chp9/corruption_perception_index.xls') cpi_sheet = cpi_workbook.sheets()[0] # for r in range(cpi_sheet.nrows): # print(r, cpi_sheet.row_values(r)) cpi_title_rows = zip(cpi_sheet.row_values(1), cpi_sheet.row_values(2)) cpi_titles = [t[0] + ' ' + t[1] for t in cpi_title_rows] cpi_titles = [t.strip() for t in cpi_titles] cpi_titles[0] = cpi_titles[0] + " Duplicate"