Example #1
0
def print_breakdown():
    """Print mean dollar differences and Trump vote shares per county group.

    For every combination of the module-level ``ages`` and ``incomes``, the
    column named 'Dollar difference for <age> year old with $<income>,000
    income' is averaged on ``rural``, ``small_towns`` and ``metro`` and the
    three means are printed. Afterwards the result of
    ``calculate_trump_pct`` is printed for each of the three groups.

    Returns:
        None. All results are written to stdout.
    """
    column_template = 'Dollar difference for {0} year old with ${1},000 income'

    for age in ages:
        for income in incomes:
            heading = '{0} with {1} \n'.format(age, income)
            # The same column is averaged on all three groups.
            column = column_template.format(age, income)
            rural_mean = rural.aggregate(agate.Mean(column))
            small_town_mean = small_towns.aggregate(agate.Mean(column))
            metro_mean = metro.aggregate(agate.Mean(column))
            print(heading, 'Rural:', rural_mean, '\n', 'Small Towns:',
                  small_town_mean, '\n', 'Metro:', metro_mean)

    rural_pct = calculate_trump_pct(rural)
    small_town_pct = calculate_trump_pct(small_towns)
    metro_pct = calculate_trump_pct(metro)
    print('Trump vote percentage in rural counties:', rural_pct, '\n',
          'Trump vote percentage in small town counties:', small_town_pct,
          '\n', 'Trump vote percentage in metro counties:', metro_pct)
Example #2
0
def statistics(data):
    """Aggregate ``data['table']`` and store the result in ``data['statistics']``.

    The aggregation requests the sums of 'killed' and 'injured', an
    ``agate.Count()`` named 'accidents', and the means of 'accidents',
    'killed' and 'injured'.

    Args:
        data: mapping holding the table to aggregate under the key 'table'.

    Returns:
        The same ``data`` object, with the 'statistics' key set.
    """
    table = data['table']
    summary_spec = [
        ('killed', agate.Sum('killed')),
        ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('mean_accidents', agate.Mean('accidents')),
        ('mean_killed', agate.Mean('killed')),
        ('mean_injured', agate.Mean('injured')),
    ]
    data['statistics'] = table.aggregate(summary_spec)
    return data
Example #3
0
def by_index(data):
    """Write the per-'year' mean of 'herftot' from ``data['oecd']`` to 'by_index.csv'.

    Args:
        data: mapping holding the table to group under the key 'oecd'.

    Returns:
        None. The aggregated result is written to 'by_index.csv'.
    """
    yearly = data['oecd'].group_by('year')
    aggregations = [('average_herftot', agate.Mean('herftot'))]
    yearly.aggregate(aggregations).to_csv('by_index.csv')
Example #4
0
def st_dev(data):
    """Aggregate ``data['hour']`` and store the result in ``data['st_dev_hour']``.

    The aggregation requests the standard deviations of 'accidents',
    'killed' and 'injured', plus the mean of 'accidents'.

    Args:
        data: mapping holding the object to aggregate under the key 'hour'.

    Returns:
        The same ``data`` object, with the 'st_dev_hour' key set.
    """
    hourly = data['hour']
    spec = [('st_dev_{}'.format(name), agate.StDev(name))
            for name in ('accidents', 'killed', 'injured')]
    spec.append(('mean_accidents', agate.Mean('accidents')))
    data['st_dev_hour'] = hourly.aggregate(spec)
    return data
    def run(self, table):
        """Flag each row by whether its value lies near the column mean.

        The mean and standard deviation of ``self._st_dev_column`` are
        computed with ``table.aggregate``. A row is flagged True when its
        value lies within ``self._deviations`` standard deviations of the
        mean (bounds inclusive), and False otherwise.

        Args:
            table: object exposing ``aggregate()`` and ``rows``.

        Returns:
            A list of booleans, one per row of ``table.rows``, in row order.
        """
        column_name = self._st_dev_column
        st_dev = table.aggregate(agate.StDev(column_name))
        mean = table.aggregate(agate.Mean(column_name))

        # Compare against explicit bounds. range() only accepts integers, so
        # it raises TypeError on non-integer aggregate results and could only
        # ever match whole-number values.
        spread = self._deviations * st_dev
        lower_bound = mean - spread
        upper_bound = mean + spread

        new_column = []
        for row in table.rows:
            val = row[column_name]
            # A missing value is never inside the band; checking for None
            # first also avoids comparing None with a number.
            new_column.append(
                val is not None and lower_bound <= val <= upper_bound)

        return new_column
Example #6
0
# Top 10 rows by 'Female'. Rows whose 'Female' value is None are not filtered
# out at this point.
most_females = table.order_by('Female', reverse=True).limit(10)
for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))

# Same ranking after dropping the rows that have no 'Female' value.
female_data = table.where(lambda r: r['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(10)
for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))

# Scratch check of a conditional lambda; both results are discarded.
(lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(0)
(lambda x: 'Positive' if x >= 1 else 'Zero or Negative')(4)

#table.columns['Place of residence (%) Urban'].aggregate(agate.Mean())
col = table.columns['Place of residence (%) Urban']
# Mean over the unfiltered column; the result is not stored.
table.aggregate(agate.Mean('Place of residence (%) Urban'))

# Mean again, restricted to rows that have a value in the column.
has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None)
has_por.aggregate(agate.Mean('Place of residence (%) Urban'))

# First row whose 'Rural' value is above 50.
first_match = has_por.find(lambda x: x['Rural'] > 50)
print(first_match['Countries and areas'])

# Add a rank column based on 'Total (%)' and show the 20 highest rows.
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    # print() with two arguments emits the same "a b" line as the former
    # Python 2 print statement, and is valid Python 3.
    print(row['Total (%)'], row['Total Child Labor Rank'])

# NOTE(review): the body below never reads `row`; it branches on `value_type`
# and appends to `types`, neither of which is defined inside this function.
# Presumably the header and the body come from two different snippets --
# confirm before relying on this definition.
def reverse_percent(row):
	if value_type == 'text':
		types.append(text_type)
	elif value_type == 'number':
		types.append(number_type)
	else:
		# Any other value_type falls back to text_type.
		types.append(text_type)

#
world_happiness_table = agate.Table(country_hs_rows, titles, types)
world_happiness_table.print_table(max_columns=7)

# Ten rows taken after sorting by 'Country' in descending order.
country_happiness_scores = (
    world_happiness_table
    .order_by('Country', reverse=True)
    .limit(10)
)
for country in country_happiness_scores:
    print(country)

happiness_score_mean = world_happiness_table.aggregate(
    agate.Mean('Happiness Score')
)
print(happiness_score_mean)

# Rows whose 'Happiness Score' is above 7.
country_high_happiness = world_happiness_table.where(
    lambda row: row['Happiness Score'] > 7
)
for country in country_high_happiness:
    print(country)

# First row whose 'Happiness Score' is below 3.
first_country_under_three = world_happiness_table.find(
    lambda row: row['Happiness Score'] < 3
)
print(first_country_under_three)

# Correlation between 'Happiness Score' and 'Dystopia Residual': entry [0, 1]
# of the matrix returned by numpy.corrcoef for the two float sequences.
correlation = numpy.corrcoef(
    list(map(float, world_happiness_table.columns["Happiness Score"].values())),
    list(map(float, world_happiness_table.columns["Dystopia Residual"].values())),
)[0, 1]
print(correlation)
Example #8
0
def means(stats):
    """Return the aggregated means of three columns of ``stats``.

    Args:
        stats: object exposing ``aggregate()``.

    Returns:
        Whatever ``stats.aggregate`` returns for the means of 'update_time',
        'license' and 'format', requested under the names
        'update_time_mean', 'license_mean' and 'format_mean'.
    """
    mean_spec = [
        ('update_time_mean', agate.Mean('update_time')),
        ('license_mean', agate.Mean('license')),
        ('format_mean', agate.Mean('format')),
    ]
    return stats.aggregate(mean_spec)
                            'Countries and areas',
                            inner=True)
# Read the JSON file inside a context manager so the file handle is closed
# as soon as the document has been parsed.
with open('../../data/chp9/earth-cleaned.json', 'rb') as f:
    country_json = json.load(f)

# Map each entry's 'name' to its 'parent'.
for dct in country_json:
    country_dict[dct['name']] = dct['parent']

# Add a 'continent' column computed per row by get_country.
cpi_and_cl = cpi_and_cl.compute([
    ('continent', agate.Formula(text_type, get_country)),
])

for r in cpi_and_cl.rows:
    print(r['Country / Territory'], r['continent'])

grp_by_cont = cpi_and_cl.group_by('continent')
# Bare expression: has no effect when run as a script.
grp_by_cont

# NOTE(review): this loop rebinds the module-level name `table` to each
# group's table; later code that reads `table` sees the last group.
for cont, table in grp_by_cont.items():
    print(cont, len(table.rows))

# Per continent: mean and max of 'Total (%)', median and min of
# 'CPI 2013 Score'.
agg = grp_by_cont.aggregate([('cl_mean', agate.Mean('Total (%)')),
                             ('cl_max', agate.Max('Total (%)')),
                             ('cpi_median', agate.Median('CPI 2013 Score')),
                             ('cpi_min', agate.Min('CPI 2013 Score'))])

# Bare expression: has no effect when run as a script.
agg
agg.print_table()

agg.print_bars('continent', 'cl_max')
Example #10
0
# The printed output shows that some of the percentage values are None.
# The where method of an agate table can be used to remove such values.
female_data = table.where(lambda x: x['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(10)

for r in most_females.rows:
    print(f'{r["Countries and areas"]}: {r["Female"]}%')
print()

# Compute the average child labor rate in cities.
# This requires the mean of the 'Place of residence (%) Urban' column.
# Exclude the null values first, then pass Mean to aggregate.
# Min and Max can be computed the same way.
has_por = table.where(lambda x: x['Place of residence (%) Urban'] is not None)
print(has_por.aggregate(agate.Mean('Place of residence (%) Urban')))
print()

# Find one row whose rural child labor rate is greater than 50%.
# find returns the first row that satisfies the condition.
first_match = has_por.find(lambda x: x['Rural'] > 50)
print(first_match['Countries and areas'])
print()

# Rank the countries by child labor rate, highest first.
# To do this, the data is ordered by the 'Total (%)' column.
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    print(row['Total (%)'], row['Total Child Labor Rank'])
# Load the pickled `africa_cpi_cl` object from the directory of this script.
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only safe if 'africa_cpi_cl.pickle' comes from a trusted source -- confirm.
with open(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'africa_cpi_cl.pickle'), 'rb') as f:
    africa_cpi_cl = pickle.load(f)

# Plot 'Total (%)' (y) against 'CPI 2013 Score' (x).
plt.plot(africa_cpi_cl.columns['CPI 2013 Score'],
         africa_cpi_cl.columns['Total (%)'])

plt.xlabel('CPI Score - 2013')
plt.ylabel('Child Labor Percentage')
plt.title('CPI & Child Labor Correlation')

plt.show()

# Isolate only the worst offenders for a separate visualization.
# The two means below are the thresholds read by highest_rates().
cl_mean = africa_cpi_cl.aggregate(agate.Mean('Total (%)'))
cpi_mean = africa_cpi_cl.aggregate(agate.Mean('CPI 2013 Score'))


def highest_rates(row):
    """Tell whether a row is above ``cl_mean`` and below ``cpi_mean``.

    Args:
        row: mapping-like row with 'Total (%)' and 'CPI 2013 Score' values.

    Returns:
        True when the row's 'Total (%)' is greater than the module-level
        ``cl_mean`` and its 'CPI 2013 Score' is less than the module-level
        ``cpi_mean``; False otherwise.
    """
    above_cl_mean = row['Total (%)'] > cl_mean
    # `and` short-circuits, so the CPI comparison only runs when needed.
    return bool(above_cl_mean and row['CPI 2013 Score'] < cpi_mean)


# Keep only the rows that highest_rates() accepts.
highest_cpi_cl = africa_cpi_cl.where(highest_rates)

# Plot 'Total (%)' (y) against 'CPI 2013 Score' (x) for the filtered rows.
plt.plot(
    highest_cpi_cl.columns['CPI 2013 Score'],
    highest_cpi_cl.columns['Total (%)'],
)

plt.xlabel('CPI Score - 2013')
for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))

# Same ranking after dropping the rows that have no 'Female' value.
female_data = table.where(lambda r: r['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(10)
for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))

# Mean over the unfiltered column, kept best-effort: a failure is reported
# and the script carries on. Only Exception is caught, so KeyboardInterrupt
# and SystemExit are no longer swallowed.
try:
    table.aggregate(agate.Mean('Place of residence (%) Urban'))
except Exception as err:
    print('Mean of unfiltered column failed: {}'.format(err))

has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None)

# Aggregations go through Table.aggregate with the column name passed to the
# aggregation, matching the other agate calls in this file. Results are not
# stored.
has_por.aggregate(agate.Mean('Place of residence (%) Urban'))
has_por.aggregate(agate.Max('Place of residence (%) Urban'))

has_por.aggregate(agate.Mean('Rural'))
has_por.aggregate(agate.Max('Rural'))

# First row whose 'Rural' value is above 50; the result is not stored.
has_por.find(lambda x: x['Rural'] > 50)

# compute() takes (new column name, computation) pairs, in that order.
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])

# If we wanted a column showing the percentage of children who are not working ...


def reverse_percent(row):
    """Return 100 minus the row's 'Total (%)' value.

    Args:
        row: mapping-like row with a numeric 'Total (%)' value.

    Returns:
        The complement of the 'Total (%)' value with respect to 100.
    """
    total_pct = row['Total (%)']
    return 100 - total_pct
Example #13
0
    10)  # top 10 countries with highest child labor
print("\nCountries with highest child labor: ")
for r in most_egregious.rows:
    # The first two fields of each row, addressed by position.
    print("{} - {}".format(r[0], r[1]))

# Girls: drop rows with no 'Female' value, then keep the ten highest.
female_data = table.where(lambda row: row['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(10)
print("\nTop 10 countries with highest girl labor:")
for r in most_females.rows:
    print("{} - {}".format(r[0], r['Female']))

# Cities: drop rows with no urban value, then print the column mean.
has_por = table.where(
    lambda row: row['Place of residence (%) Urban'] is not None)
print("\nAverage % of child labor in cities: {}".format(
    has_por.aggregate(agate.Mean('Place of residence (%) Urban'))))

# First row whose 'Rural' value is above 50.
first_match = has_por.find(lambda row: row['Rural'] > 50)
print("\nFirst row with more than 50% of rural child labor is - {}".format(
    first_match['Countries and areas']))

# Rank the worst offenders in terms of child labor percentages by country.
print("\nList of top 20 worst offenders: ")

top_ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
for r in top_ranked.order_by('Total (%)', reverse=True).limit(20).rows:
Example #14
0
#!/usr/bin/env python

import agate


# Load the CSV, reading 'census_tract_number_string' as Text.
table = agate.Table.from_csv(
    'data/hmda_lar__smith_county__all_years__filtered.csv',
    column_types={'census_tract_number_string': agate.Text()},
)

# Group the rows by census tract.
grouped = table.group_by('census_tract_number_string')

# Per group: a count, the mean applicant income and the mean loan amount.
summarized = grouped.aggregate([
    ('count', agate.Count()),
    ('avg_applicant_income_000s', agate.Mean('applicant_income_000s')),
    ('avg_loan_amount_000s', agate.Mean('loan_amount_000s')),
])

summarized.to_csv('data/census_tract_summary.csv')
Example #15
0
    country_json = json.load(f)

# Map each entry's 'name' to its 'parent'.
country_dict = {entry['name']: entry['parent'] for entry in country_json}

# Add a 'continent' text column computed per row by get_country.
cpi_and_cl = cpi_and_cl.compute([
    ('continent', agate.Formula(agate.Text(), get_country)),
])

grp_by_cont = cpi_and_cl.group_by('continent')
print(grp_by_cont)

for cont, table in grp_by_cont.items():
    print(cont, len(table.rows))

# By eye, Africa and Asia can be seen to have high values.
# That alone, however, makes it hard to reason about the data.
# This is where the aggregation methods come in.
# Compare how the continents differ in terms of perceived government
# corruption and child labor.
agg = grp_by_cont.aggregate([
    ('cl_mean', agate.Mean('Total (%)')),
    ('cl_max', agate.Max('Total (%)')),
    ('cpi_median', agate.Median('CPI 2013 Score')),
    ('cpi_min', agate.Min('CPI 2013 Score')),
])
agg.print_table()
print()
agg.print_bars('continent', 'cl_max')

# Persist the table with the added 'continent' column next to this script.
with open(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'cpi_and_cl_2.pickle'),
        'wb') as f:
    pickle.dump(cpi_and_cl, f)
# One single-value row per entry of df_orig.values, coerced to float.
rows = [[float(val)] for val in df_orig.values]
table = ag.Table(rows, column_names, column_types)

#%%
print(table.column_names)
outliers = table.stdev_outliers(
    column_name='Max Amount', deviations=3, reject=True)
print(len(outliers.rows))

# Collect the 'Max Amount' of every returned row as a float, echoing each.
outlier_list = []
for row in outliers.rows:
    outlier_list.append(float(row['Max Amount']))
    print(row['Max Amount'])

print("Mean: {}".format(table.aggregate(ag.Mean('Max Amount'))))
"""
11 Found:
116.99
116.99
116.99
118.99
100.0
116.99
312.95
310.95
311.95
312.95
69.95
Mean: 12.47918588873812754409769335
"""
Example #17
0
#%%[markdown]
# ### 2) Which countries have the most girls working?

#%%
# Drop rows with no 'Female' value, then keep the five highest.
female_data = table.where(lambda r: r['Female'] is not None)
most_females = female_data.order_by('Female', reverse=True).limit(5)

for r in most_females.rows:
    print('{}: {}%'.format(r['Countries and areas'], r['Female']))

#%%[markdown]
# ### 3) What is the average percentage of child labor in cities?
#%%
# NOTE(review): the column name is written 'Place of residence (%)Urban'
# (no space before 'Urban') -- confirm it matches the table's header.
has_poor = table.where(lambda r: r['Place of residence (%)Urban'] is not None)
avg_percent = has_poor.aggregate(agate.Mean('Place of residence (%)Urban'))

# The mean is rounded to two decimal places for display only.
print('Average percentage of child labor in cities: {}%'.format(
    round(avg_percent, 2)))

#%%[markdown]
# ### 4) Find a row with more than 50% of rural child labor.
#%%
# `has_poor` is rebound here to the rows that have a 'Rural' value.
has_poor = table.where(lambda r: r['Rural'] is not None)
first_match = has_poor.find(lambda x: x['Rural'] > 50)

print('The first row with > 50% rural child labor is',
      first_match['Countries and areas'])

#%%[markdown]
# ### 5) Rank the worst offenders in terms of child labor percentages by country.