def print_breakdown():
    """Print mean dollar differences per age/income pair, then Trump vote shares.

    For every combination drawn from the module-level ``ages`` and ``incomes``
    the mean of the matching "Dollar difference ..." column is printed for the
    ``rural``, ``small_towns`` and ``metro`` tables. The Trump vote percentage
    of each of those tables is printed as well.
    """
    column_template = 'Dollar difference for {0} year old with ${1},000 income'

    for age in ages:
        for income in incomes:
            heading = '{0} with {1} \n'.format(age, income)
            # All three tables are queried on the same column name.
            column = column_template.format(age, income)
            rural_mean = rural.aggregate(agate.Mean(column))
            small_towns_mean = small_towns.aggregate(agate.Mean(column))
            metro_mean = metro.aggregate(agate.Mean(column))
            print(
                heading,
                'Rural:', rural_mean, '\n',
                'Small Towns:', small_towns_mean, '\n',
                'Metro:', metro_mean,
            )

    # NOTE(review): this summary is emitted once, after the loops, on the
    # assumption that it does not belong to the per-combination output;
    # confirm against the original layout.
    rural_pct = calculate_trump_pct(rural)
    small_towns_pct = calculate_trump_pct(small_towns)
    metro_pct = calculate_trump_pct(metro)
    print(
        'Trump vote percentage in rural counties:', rural_pct, '\n',
        'Trump vote percentage in small town counties:', small_towns_pct, '\n',
        'Trump vote percentage in metro counties:', metro_pct,
    )
def statistics(data):
    """Aggregate totals, a row count and means over ``data['table']``.

    The aggregation result is stored under ``data['statistics']`` and the
    same (mutated) mapping is returned.
    """
    table = data['table']
    summaries = [
        ('killed', agate.Sum('killed')),
        ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('mean_accidents', agate.Mean('accidents')),
        ('mean_killed', agate.Mean('killed')),
        ('mean_injured', agate.Mean('injured')),
    ]
    data['statistics'] = table.aggregate(summaries)
    return data
def by_index(data):
    """Write the per-year mean of ``herftot`` from ``data['oecd']`` to by_index.csv."""
    yearly = data['oecd'].group_by('year')
    mean_herftot = agate.Mean('herftot')
    summary = yearly.aggregate([('average_herftot', mean_herftot)])
    summary.to_csv('by_index.csv')
def st_dev(data):
    """Aggregate standard deviations and the mean accident count over ``data['hour']``.

    The aggregation result is stored under ``data['st_dev_hour']`` and the
    same (mutated) mapping is returned.
    """
    hourly = data['hour']
    summaries = [
        ('st_dev_accidents', agate.StDev('accidents')),
        ('st_dev_killed', agate.StDev('killed')),
        ('st_dev_injured', agate.StDev('injured')),
        ('mean_accidents', agate.Mean('accidents')),
    ]
    data['st_dev_hour'] = hourly.aggregate(summaries)
    return data
def run(self, table):
    """Flag each row whose value lies within N standard deviations of the mean.

    The column named by ``self._st_dev_column`` is analysed; the window is
    ``mean ± self._deviations * st_dev``.

    Args:
        table: Table exposing ``aggregate`` and ``rows``.

    Returns:
        list of bool, one entry per row of ``table.rows``: ``True`` when the
        row's value falls inside the window (bounds inclusive), ``False``
        otherwise, including when the value is ``None``.
    """
    column_name = self._st_dev_column
    st_dev = table.aggregate(agate.StDev(column_name))
    mean = table.aggregate(agate.Mean(column_name))

    # Compare against explicit bounds instead of building a range():
    # range() only accepts integer arguments (agate's numeric aggregations
    # are documented to return Decimal, which it rejects) and membership in
    # a range only ever matches whole numbers.
    spread = self._deviations * st_dev
    lower = mean - spread
    upper = mean + spread

    values = (row[column_name] for row in table.rows)
    # None cannot be ordered against a number, so nulls are reported as
    # "not within range" rather than raising.
    return [val is not None and lower <= val <= upper for val in values]
most_females = table.order_by('Female', reverse=True).limit(10) for r in most_females.rows: print '{}: {}%'.format(r['Countries and areas'], r['Female']) female_data = table.where(lambda r: r['Female'] is not None) most_females = female_data.order_by('Female', reverse=True).limit(10) for r in most_females.rows: print '{}: {}%'.format(r['Countries and areas'], r['Female']) (lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(0) (lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(4) #table.columns['Place of residence (%) Urban'].aggregate(agate.Mean()) col = table.columns['Place of residence (%) Urban'] table.aggregate(agate.Mean('Place of residence (%) Urban')) has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None) has_por.aggregate(agate.Mean('Place of residence (%) Urban')) first_match = has_por.find(lambda x: x['Rural'] > 50) print(first_match['Countries and areas']) ranked = table.compute([ ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)), ]) for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows: print row['Total (%)'], row['Total Child Labor Rank'] def reverse_percent(row):
if value_type == 'text': types.append(text_type) elif value_type == 'number': types.append(number_type) else: types.append(text_type) # world_happiness_table = agate.Table(country_hs_rows,titles,types) world_happiness_table.print_table(max_columns=7) country_happiness_scores = world_happiness_table.order_by('Country', reverse=True).limit(10) for country in country_happiness_scores: print(country) happiness_score_mean = world_happiness_table.aggregate(agate.Mean('Happiness Score')) print(happiness_score_mean) country_high_happiness = world_happiness_table.where(lambda r: r['Happiness Score'] > 7) for country in country_high_happiness: print(country) first_country_under_three = world_happiness_table.find(lambda r: r['Happiness Score'] < 3) print(first_country_under_three) #calculate the correlation between Happiness Score and Dystopia Residual correlation = numpy.corrcoef( [float(value) for value in world_happiness_table.columns["Happiness Score"].values()], [float(compare_value) for compare_value in world_happiness_table.columns["Dystopia Residual"].values()] )[0,1] print(correlation)
def means(stats):
    """Return the aggregated means of ``update_time``, ``license`` and ``format``.

    ``stats`` must expose ``aggregate``; its result is returned unchanged.
    """
    aggregations = [
        ('update_time_mean', agate.Mean('update_time')),
        ('license_mean', agate.Mean('license')),
        ('format_mean', agate.Mean('format')),
    ]
    return stats.aggregate(aggregations)
'Countries and areas', inner=True) country_json = json.loads( open('../../data/chp9/earth-cleaned.json', 'rb').read()) for dct in country_json: country_dict[dct['name']] = dct['parent'] cpi_and_cl = cpi_and_cl.compute([ ('continent', agate.Formula(text_type, get_country)), ]) for r in cpi_and_cl.rows: print r['Country / Territory'], r['continent'] grp_by_cont = cpi_and_cl.group_by('continent') grp_by_cont for cont, table in grp_by_cont.items(): print cont, len(table.rows) agg = grp_by_cont.aggregate([('cl_mean', agate.Mean('Total (%)')), ('cl_max', agate.Max('Total (%)')), ('cpi_median', agate.Median('CPI 2013 Score')), ('cpi_min', agate.Min('CPI 2013 Score'))]) agg agg.print_table() agg.print_bars('continent', 'cl_max')
# The printed output shows that some percentage values are None.
# The agate table's `where` method can be used to drop those rows.
female_data = table.where(lambda row: row['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(10)
for r in most_females.rows:
    print(f'{r["Countries and areas"]}: {r["Female"]}%')
print()

# Compute the average child labor rate in cities.
# That is the mean of the "Place of residence (%) Urban" column.
# Null values are excluded first, then Mean is passed to aggregate.
# Min and Max can be computed the same way.
has_por = table.where(
    lambda row: row['Place of residence (%) Urban'] is not None)
print(has_por.aggregate(agate.Mean('Place of residence (%) Urban')))
print()

# Find one row whose rural child labor rate is above 50%.
# `find` returns the first row that satisfies the condition.
# has_por only guarantees a non-null Urban value, so Rural is checked for
# None here: on Python 3 `None > 50` raises TypeError.
first_match = has_por.find(
    lambda row: row['Rural'] is not None and row['Rural'] > 50)
print(first_match['Countries and areas'])
print()

# Rank the countries by child labor rate.
# The data is ordered on the "Total (%)" column for this.
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    print(row['Total (%)'], row['Total Child Labor Rank'])
# Load the pickled table stored next to this script.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
with open(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'africa_cpi_cl.pickle'),
        'rb') as f:
    africa_cpi_cl = pickle.load(f)

# Plot CPI score against total child labor percentage for the whole table.
plt.plot(
    africa_cpi_cl.columns['CPI 2013 Score'],
    africa_cpi_cl.columns['Total (%)'],
)
plt.xlabel('CPI Score - 2013')
plt.ylabel('Child Labor Percentage')
plt.title('CPI & Child Labor Correlation')
plt.show()

# Isolate only the worst offending countries and visualize that subset.
cl_mean = africa_cpi_cl.aggregate(agate.Mean('Total (%)'))
cpi_mean = africa_cpi_cl.aggregate(agate.Mean('CPI 2013 Score'))


def highest_rates(row):
    """Return True when ``row`` is above the mean child labor rate and below the mean CPI score."""
    above_mean_labor = row['Total (%)'] > cl_mean
    return above_mean_labor and row['CPI 2013 Score'] < cpi_mean


highest_cpi_cl = africa_cpi_cl.where(highest_rates)
plt.plot(
    highest_cpi_cl.columns['CPI 2013 Score'],
    highest_cpi_cl.columns['Total (%)'],
)
plt.xlabel('CPI Score - 2013')
# Print the current top rows (may still contain None percentages).
for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))

# Drop rows without a Female value, then list the ten highest.
female_data = table.where(lambda row: row['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(10)
for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))

# Best-effort attempt on the unfiltered column; a failure here is expected
# and deliberately ignored. `Exception` is caught instead of using a bare
# `except` so that KeyboardInterrupt / SystemExit still propagate.
try:
    table.columns['Place of residence (%) Urban'].mean()
except Exception:
    pass

# Same statistics on rows that actually have an Urban value.
has_por = table.where(
    lambda row: row['Place of residence (%) Urban'] is not None)
has_por.columns['Place of residence (%) Urban'].aggregate(agate.Mean())
has_por.columns['Place of residence (%) Urban'].aggregate(agate.Max())
has_por.columns['Rural'].aggregate(agate.Mean())
has_por.columns['Rural'].aggregate(agate.Max())

# First row with a Rural value above 50.
has_por.find(lambda row: row['Rural'] > 50)

# NOTE(review): the (computation, name) pair order and the column-level
# aggregate() calls above look like the pre-1.0 agate API — confirm against
# the installed agate version before changing them.
ranked = table.compute([
    (agate.Rank('Total (%)', reverse=True), 'Total Child Labor Rank'),
])


# If we wanted a column showing children not working percentage ...
def reverse_percent(row):
    """Return 100 minus the row's ``Total (%)`` value."""
    return 100 - row['Total (%)']
10) # top 10 countries with highest child labor print("\nCountries with highest child labor: ") for r in most_egregious.rows: print("{} - {}".format(r[0], r[1])) # print result female_data = table.where(lambda r: r['Female'] is not None) # get girl data most_females = female_data.order_by('Female', reverse=True).limit( 10) # top 10 countries with highest girl labor print("\nTop 10 countries with highest girl labor:") for r in most_females.rows: print("{} - {}".format(r[0], r['Female'])) # print result has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None ) # remove empty data print("\nAverage % of child labor in cities: {}".format( has_por.aggregate(agate.Mean('Place of residence (%) Urban'))) ) # print result average% of child labour first_match = has_por.find(lambda x: x[ 'Rural'] > 50) # Find a row with more than 50 % of rural child labor print("\nFirst row with more than 50% of rural child labor is - {}".format( first_match['Countries and areas'])) print( "\nList of top 20 worst offenders: " ) # Rank the worst offenders in terms of child labor percentages by country top_ranked = table.compute([ ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)), ]) for r in top_ranked.order_by('Total (%)', reverse=True).limit(20).rows:
#!/usr/bin/env python
# Group the filtered HMDA CSV by census tract and write one summary row per
# tract: row count, mean applicant income and mean loan amount.

import agate

# Force the tract identifier to be read as text instead of being inferred.
tract_column_types = {'census_tract_number_string': agate.Text()}
table = agate.Table.from_csv(
    'data/hmda_lar__smith_county__all_years__filtered.csv',
    column_types=tract_column_types,
)

grouped = table.group_by('census_tract_number_string')

tract_aggregations = [
    ('count', agate.Count()),
    ('avg_applicant_income_000s', agate.Mean('applicant_income_000s')),
    ('avg_loan_amount_000s', agate.Mean('loan_amount_000s')),
]
summarized = grouped.aggregate(tract_aggregations)

summarized.to_csv('data/census_tract_summary.csv')
# NOTE(review): `f` is an already-open file object supplied by code outside
# this excerpt — confirm it is still open at this point.
country_json = json.load(f)

# Map each entry's name to its parent.
country_dict = {}
for dct in country_json:
    country_dict[dct['name']] = dct['parent']

# Add a text column "continent" computed per row by get_country.
continent_formula = agate.Formula(agate.Text(), get_country)
cpi_and_cl = cpi_and_cl.compute([('continent', continent_formula)])

grp_by_cont = cpi_and_cl.group_by('continent')
print(grp_by_cont)
for cont, table in grp_by_cont.items():
    print(cont, len(table.rows))

# Eyeballing the output shows that Africa and Asia have high values,
# but that alone does not make the data easy to reason about.
# This is where aggregation methods come in.
# Compare how the continents differ in perceived government corruption
# and in child labor.
continent_aggregations = [
    ('cl_mean', agate.Mean('Total (%)')),
    ('cl_max', agate.Max('Total (%)')),
    ('cpi_median', agate.Median('CPI 2013 Score')),
    ('cpi_min', agate.Min('CPI 2013 Score')),
]
agg = grp_by_cont.aggregate(continent_aggregations)
agg.print_table()
print()
agg.print_bars('continent', 'cl_max')

# Persist the table (including the new column) next to this script.
with open(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'cpi_and_cl_2.pickle'),
        'wb') as f:
    pickle.dump(cpi_and_cl, f)
# Build a one-column table: each source value becomes a single-float row.
rows = [[float(val)] for val in df_orig.values]
table = ag.Table(rows, column_names, column_types)
#%%
print(table.column_names)

# Rows returned by stdev_outliers for 'Max Amount' at 3 deviations.
outliers = table.stdev_outliers(
    column_name='Max Amount',
    deviations=3,
    reject=True,
)
print(len(outliers.rows))

# Collect each returned amount as a float and echo it.
outlier_list = []
for row in outliers.rows:
    amount = row['Max Amount']
    outlier_list.append(float(amount))
    print(amount)

mean_amount = table.aggregate(ag.Mean('Max Amount'))
print("Mean: {}".format(mean_amount))
""" 11 Found: 116.99 116.99 116.99 118.99 100.0 116.99 312.95 310.95 311.95 312.95 69.95 Mean: 12.47918588873812754409769335 """
#%%[markdown]
# ### 2) Which countries have the most girls working?
#%%
# Keep only rows that have a Female value, then show the five highest.
female_data = table.where(lambda row: row['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(5)
for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))
#%%[markdown]
# ### 3) What is the average percentage of child labor in cities?
#%%
# Exclude rows without an urban value before taking the mean.
has_poor = table.where(
    lambda row: row['Place of residence (%)Urban'] is not None)
avg_percent = has_poor.aggregate(agate.Mean('Place of residence (%)Urban'))
rounded_percent = round(avg_percent, 2)
print('Average percentage of child labor in cities: {}%'.format(
    rounded_percent))
#%%[markdown]
# ### 4) Find a row with more than 50% of rural child labor.
#%%
# Filter out null Rural values first so the comparison below is safe.
has_poor = table.where(lambda row: row['Rural'] is not None)
first_match = has_poor.find(lambda row: row['Rural'] > 50)
print('The first row with > 50% rural child labor is',
      first_match['Countries and areas'])
#%%[markdown]
# ### 5) Rank the worst offenders in terms of child labor percentages by country.