Exemple #1
0
def subset(data):
    subset = data['table'].where(
        lambda r: r['origin'] in SELECTED_COUNTRIES and r['year'] >= 1980)
    groups = subset.group_by(lambda r: '/'.join([str(r['year']), r['origin']]),
                             key_name='year_and_origin')

    refugees = groups.aggregate([
        ('refugees', agate.Sum('refugees')),
        ('asylum_seekers', agate.Sum('asylum_seekers')),
        ('returned_refugees', agate.Sum('returned_refugees')),
        ('idps', agate.Sum('idps')),
        ('returned_idps', agate.Sum('returned_idps')),
        ('stateless_persons', agate.Sum('stateless_persons')),
        ('others', agate.Sum('others')), ('total', agate.Sum('total'))
    ]).order_by('year_and_origin', reverse=True)

    refugees = refugees.compute([
        ('year',
         agate.Formula(agate.Text(),
                       lambda r: r['year_and_origin'].split('/')[0])),
        ('origin',
         agate.Formula(agate.Text(),
                       lambda r: r['year_and_origin'].split('/')[1]))
    ])

    refugees = refugees.select([
        'origin', 'year', 'refugees', 'asylum_seekers', 'idps',
        'returned_idps', 'stateless_persons', 'others', 'total'
    ])

    refugees.to_csv('subset.csv')
    refugees.pivot(
        'year', 'origin',
        agate.Sum('total')).order_by('year').to_csv('subset_pivot.csv')
Exemple #2
0
    def _analyze_date(self, event):
        table = self.table.where(
            lambda row: row["fields/date"] is not None).compute([
                (
                    "reduce_to_date",
                    agate.Formula(
                        agate.Text(),
                        lambda row: helpers.reduce_to_date(row["fields/date"]),
                    ),
                ),
                (
                    "reduce_to_year",
                    agate.Formula(
                        agate.Number(),
                        lambda row: helpers.reduce_to_year(row["fields/date"]),
                    ),
                ),
                (
                    "reduce_to_time",
                    agate.Formula(
                        agate.Number(),
                        lambda row: helpers.reduce_to_time(row["fields/date"]),
                    ),
                ),
            ])

        years = table.distinct(
            "reduce_to_year").columns["reduce_to_year"].values()

        _data = {}

        for year in years:
            _data[year] = (table.where(lambda row: row[
                "reduce_to_year"] == year).select("reduce_to_date").pivot(
                    "reduce_to_date").order_by("reduce_to_date"))

        event.set()

        print(f"\n\n{helpers.h1_icn} Date\n")

        for year in years:
            data_keys = list(_data[year].columns["reduce_to_date"].values())
            _counts = list(
                map(int, list(_data[year].columns["Count"].values())))
            _sum = sum(_counts)
            data_count = [[i] for i in _counts]

            args = {
                "color": False,
                "custom_tick": False,
                "start_dt": f"{year}-01-01"
            }

            print(f"\n{helpers.h2_icn} Year {year} ({_sum:,} emails)\n")
            calendar_heatmap(data=data_count, args=args, labels=data_keys)
Exemple #3
0
def time_span():

    candidate_contribs_with_monthyear = ky_candidate_contributions.compute([
        ('month_year',
         agate.Formula(text_type, lambda r: r['contb_receipt_dt'][-6:])),
        ('date',
         agate.Formula(
             text_type, lambda r: datetime.datetime.strptime(
                 r['contb_receipt_dt'], '%d-%b-%y')))
    ])

    candidate_contribs_with_monthyear.to_csv('data/csv/process/' + datestamp +
                                             'ky-contribs-with-monthyear.csv')
Exemple #4
0
def add_value(data):
    key = {
        0: 'no data',
        1: 'right',
        2: 'center',
        3: 'left',
        None: 'no executive'
    }

    data['value_added'] = data['dpi'].compute([
        ('alignment', agate.Formula(text, lambda r: key[r['execrlc']])),
        ('any_nationalist', agate.Formula(boolean, lambda r: r['gov1nat'] or r['gov2nat'] or r['gov3nat'] or r['opp1nat'])),
        ('any_regionalist', agate.Formula(boolean, lambda r: bool(r['gov1reg'] or r['gov2reg'] or r['gov3reg'] or r['opp1reg'])))
    ])
Exemple #5
0
def calc_table(in_csv, out_csv):

    table = agate.Table.from_csv(in_csv,
                                 column_names=column_names,
                                 column_types=column_types)
    table = table.pivot('HIEDUC', 'STAYTOG')

    table = table.compute([('Total', agate.Formula(agate.Number(),
                                                   get_total))])
    table = table.compute([('Percent agree',
                            agate.Formula(agate.Number(), get_percent_agree))])
    table.to_csv(out_csv)

    return table
 def test_grouping(self):
     exonerations = agate.Table.from_csv(
         '../../../data/exonerations-20150828.csv')
     clean_state_data = exonerations.compute(
         [('federal',
           agate.Formula(agate.Boolean(),
                         lambda row: row['state'].startswith('F-'))),
          ('state',
           agate.Formula(
               agate.Text(), lambda row: row['state'][2:]
               if row['state'].startswith('F-') else row['state']))],
         replace=True)
     by_state = clean_state_data.group_by('state')
     state_totals = by_state.aggregate([('count', agate.Count())])
     sorted_totals = state_totals.order_by('count', reverse=True)
     sorted_totals.print_table(max_rows=10)
def add_full_hour_date(data):
    data['table'] = data['table'].compute([
        ('date_hour',
         agate.Formula(
             agate.Text(),
             lambda r: r['date_time'].strftime("%Y-%m-%d %H:00:00"))),
    ])
Exemple #8
0
def compute_ranks(table):
    table = table.compute([
        ('dataset_rank', agate.Rank('datasets', reverse=True)),
        ('formats_rank', agate.Rank('format_count', reverse=True)),
        #('open_formats_rank', agate.Rank('open_formats', reverse=True)),
        ('last_update_rank', agate.Rank('days_since_last_update')),
        #('open_datasets_rank', agate.Rank('open_datasets', reverse=True)),
        ('category_rank', agate.Rank('category_count', reverse=True)),
        ('category_variance_rank', agate.Rank('category_variance')),
        #('update_start_rank', agate.Rank('days_between_start_and_last_update')),
        #('start_rank', agate.Rank('days_since_start', reverse=True)),
        #('openess_score', agate.Formula(number, openness_score)),
        ('dataset_score_rank', agate.Rank('dataset_score', reverse=True)),
        ('category_score_rank', agate.Rank('category_score', reverse=True)),
    ])
    table = table.compute([
        ('dataset_rank_std', StandadizeScore('dataset_rank')),
        ('formats_rank_std', StandadizeScore('formats_rank')),
        ('last_update_rank_std', StandadizeScore('last_update_rank')),
        ('category_rank_std', StandadizeScore('category_rank')),
        ('category_variance_rank_std', StandadizeScore('category_variance_rank')),
        ('dataset_score_rank_std', StandadizeScore('dataset_score_rank')),
        ('category_score_rank_std', StandadizeScore('category_score_rank')),
        ])
    table = table.compute([
        ('overall_rank_data', agate.Formula(agate.Number(), overall_rank))
    ])
    table = table.compute([
        ('overall_rank', agate.Rank('overall_rank_data')),
        ])
    return table
    def _overall_stats(self):
        count_open_licenses = agate.Summary(
            'license_id', agate.Number(),
            lambda r: sum(license_id in utils.OPEN_LICENSES
                          for license_id in r.values()))

        self.overall_package_stats = self._package_table().aggregate([
            ('open_data_count', count_open_licenses),
        ])
        self.resource_stats = self._package_resource_table().compute([
            ('open_format', agate.Formula(agate.Boolean(),
                                          open_formats_count)),
        ])
        if len(self._package_resource_table()) > 0:
            self.resource_stats = self.resource_stats.aggregate([
                ('open_format_count', agate.Count('open_format', True)),
                ('min_date', agate.Min('created')),
                ('max_date', agate.Max('created'))
            ])
            format_table = self._package_resource_table().group_by(
                "format").aggregate([
                    ('count', agate.Count()),
                ])
            count = format_table.aggregate([
                ('different_formats', agate.Count()),
            ])
            self.open_datasets = self.overall_package_stats.get(
                "open_data_count", 0)
            self.open_format_count = self.resource_stats.get(
                "open_format_count", 0)
            self.format_count = count.get("different_formats", 0)
            self.compute_dates()
Exemple #10
0
def _add_random_column(data_tbl):
    # Reset seed to produce random numbers
    npr.seed()

    new_table = data_tbl.compute([('random_group',
                                   agate.Formula(agate.Number(),
                                                 _generate_random))])
    return new_table
Exemple #11
0
def monthly_contrib_count():

    candidate_contribs_with_monthyear = ky_candidate_contributions.compute([
        ('month_year',
         agate.Formula(text_type, lambda r: r['contb_receipt_dt'][-6:])),
        ('date',
         agate.Formula(
             text_type, lambda r: datetime.datetime.strptime(
                 r['contb_receipt_dt'], '%d-%b-%y')))
    ])

    july16_contributions = candidate_contribs_with_monthyear.where(
        lambda r: r['month_year'] == 'JUL-16')

    positive_contributions = july16_contributions.where(
        lambda r: r['contb_receipt_amt'] > 0)

    candidate_brackets(july16_contributions)
 def get_org_format_aggregates(self, package_table):
     # format can not exist!?!
     format_table = package_table.group_by("format").aggregate([
         ('count', agate.Count()),
     ])
     open_format_table = package_table.compute([
         ('open_format', agate.Formula(agate.Boolean(), open_formats_count))
     ])
     open_format_table_aggregates = open_format_table.aggregate([
         ('open_formats', agate.Count('open_format', True)),
     ])
     new_table = format_table.compute([
         ('open_format', agate.Formula(agate.Boolean(), open_formats_count))
     ])
     count = new_table.aggregate([
         ('different_formats', agate.Count()),
         ('open_formats', agate.Count('open_format', True)),
     ])
     count["open_formats_datasets"] = open_format_table_aggregates[
         "open_formats"]
     return count
 def compute_ranks(self):
     self.orgs_table = self.orgs_table.compute([
         ('dataset_rank', agate.Rank('datasets')),
         ('formats_rank', agate.Rank('format_count')),
         ('open_formats_rank', agate.Rank('datasets')),
         ('last_update_rank', agate.Rank('days_since_last_update')),
         ('open_datasets_rank', agate.Rank('open_datasets')),
     ])
     self.orgs_table = self.orgs_table.compute([
         ('overall_rank_data', agate.Formula(number, overall_rank)),
     ])
     self.orgs_table = self.orgs_table.compute([
         ('overall_rank', agate.Rank('overall_rank_data', reverse=True)),
     ])
Exemple #14
0
def sum_counts_by_hour(data):
    data['hour'] = data['table'].group_by('hour').aggregate([
        ('killed', agate.Sum('killed')), ('injured', agate.Sum('injured')),
        ('accidents', agate.Count()),
        ('accidents_injured', count_accidents_injured)
    ]).compute([
        ('killed_percent', agate.Percent('killed')),
        ('injured_percent', agate.Percent('injured')),
        ('accidents_percent', agate.Percent('accidents')),
    ]).compute([
        ('weighted',
         agate.Formula(agate.Number(),
                       lambda r: r['killed_percent'] + r['injured_percent'])),
        ('accidents_within_half_deviation',
         StandardDeviations('accidents', 0.5)),
        ('killed_within_half_deviation', StandardDeviations('killed', 0.5)),
        ('injured_within_half_deviation', StandardDeviations('injured', 0.5))
    ])
    return data
Exemple #15
0
def update_where(self, update_col, update_val, test_col, test_val):
    # check the types of the update and test columns.
    colnames = self.column_names
    types = [
        self.column_types[colnames.index(update_col)],
        self.column_types[colnames.index(test_col)]
    ]
    # start the magic
    self = self.join(self.select([update_col, test_col]) \
        .rename(column_names = {update_col: 'update_col', test_col: 'test_col'})
        )
    self = self.join(
        agate.Table([[update_val, test_val]] * len(self.rows),
                    ['update_val', 'test_val'], types))
    self = self.compute([('updated',
                          agate.Formula(agate.Number(),
                                        update_where_function))])
    self = self.rename(column_names = {update_col: 'old', 'updated': update_col}) \
        .exclude(['old', 'update_col', 'test_col', 'update_val', 'test_val'])
    return self
Exemple #16
0
    def _analyze_count(self, event):
        # Average emails per day
        total = self.table.aggregate([("total", agate.Count())])["total"]
        total_senders = (
            self.table.distinct("fields/from").select("fields/from").aggregate(
                [("total", agate.Count())])["total"])

        if total == 0:
            first_email_date = ""
            last_email_date = None
        else:
            date_data = self.table.where(
                lambda row: row["fields/date"] is not None).compute([(
                    "reduce_to_datetime",
                    agate.Formula(
                        agate.DateTime(datetime_format="%Y-%m-%d %H:%M:%S"),
                        lambda row: helpers.reduce_to_datetime(row[
                            "fields/date"]),
                    ),
                )])
            first_email_date = (date_data.order_by("reduce_to_datetime").limit(
                1).columns["fields/date"].values()[0])
            last_email_date = (date_data.order_by(
                "reduce_to_datetime",
                reverse=True).limit(1).columns["fields/date"].values()[0])
        event.set()

        metrics = [
            ["Total emails", total],
            ["Senders", total_senders],
            ["First Email Date", first_email_date],
        ]

        if last_email_date:
            date_delta = helpers.convert_date(
                last_email_date) - helpers.convert_date(first_email_date)
            avg_email_per_day = total / date_delta.days
            metrics.append(["Avg. Emails/Day", f"{avg_email_per_day:.2f}"])

        print(f"\n\n{helpers.h1_icn} Stats\n")
        print(termtables.to_string(metrics))
Exemple #17
0
def to_and_from(data):
    refugees = data['table'].select(
        ['origin', 'residence', 'year', 'refugees'])

    by_year = refugees.group_by('year')

    by_origin = (by_year.group_by('origin').aggregate([
        ('origin_refugees', agate.Sum('refugees'))
    ]))

    by_residence = (by_year.group_by('residence').aggregate([
        ('residence_refugees', agate.Sum('refugees'))
    ]))

    def comparison(r):
        origin = r['origin_refugees']
        residence = r['residence_refugees']

        if not origin:
            return None

        if not residence:
            return None

        return 1 - (abs(origin - residence) / (origin + residence))

    joined = (by_origin.join(by_residence, lambda r: (r['year'], r['origin']),
                             lambda r: (r['year'], r['residence'])).exclude([
                                 'residence', 'year2'
                             ]).rename(column_names={
                                 'origin': 'country'
                             }).compute([
                                 ('ratio',
                                  agate.Formula(agate.Number(), comparison))
                             ]))

    joined.to_csv('joined.csv')
Exemple #18
0
def main():
    table = agate.Table.from_csv('data.csv')

    print(list(zip(table.column_names, table.column_types)))

    with_decimals = table.compute([
        ('year', agate.Formula(agate.Text(), lambda r: r['Date'].year)),
        ('lat', DecimalDegrees('Latitude')),
        ('lng', DecimalDegrees('Longitude')),
    ])

    with_decimals.to_csv('with_decimals.csv')

    by_year = with_decimals.group_by('year')
    by_usable = by_year.group_by(
        lambda r: r['lat'] is not None and r['lng'] is not None)

    by_year.aggregate([('count', agate.Length())]).print_csv()

    by_usable.aggregate([('count', agate.Length())]).print_table()

    data = {}

    for year, table in by_year.items():
        data[year] = []

        for row in table.rows:
            if not row['lat'] or not row['lng']:
                continue

            data[year].append(
                geojson.Point([float(row['lng']),
                               float(row['lat'])]))

    with open('src/data/attacks.json', 'w') as f:
        geojson.dump(data, f, sort_keys=True)
has_por.columns['Rural'].aggregate(agate.Mean())
has_por.columns['Rural'].aggregate(agate.Max())

has_por.find(lambda x: x['Rural'] > 50)

ranked = table.compute([(agate.Rank('Total (%)',
                                    reverse=True), 'Total Child Labor Rank')])

# If we wanted a column showing children not working percentage ...


def reverse_percent(row):
    return 100 - row['Total (%)']


table = table.compute([(agate.Formula(number_type, reverse_percent),
                        'Children not working (%)')])

# some investigation into other possible connections

hiv_workbook = xlrd.open_workbook(DATA_FOLDER + 'hiv_aids_2014.xlsx')
hiv_workbook.sheet_names()
children = hiv_workbook.sheet_by_name('T6_CABA')
hiv_titles = zip(children.row_values(3), children.row_values(4),
                 children.row_values(5))
hiv_titles = [' '.join(list(t)) for t in hiv_titles]
hiv_titles = [t.strip() for t in hiv_titles]

for row in range(0, children.nrows):
    print(row, children.row(row))
Exemple #20
0
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    print(row['Total Child Labor Rank'], '\t', row['Countries and areas'],
          row['Total (%)'])

#%%[markdown]
# ### 6) Calculate the percentage of children not involved in child labor.


#%%
def reverse_percent(row):
    return 100 - row['Total (%)']


ranked = table.compute([
    ('Children not working (%)', agate.Formula(number_type, reverse_percent)),
])
ranked = ranked.compute([
    ('Total Child Labor Rank', agate.Rank('Children not working (%)')),
])

percent_not_working = ranked.aggregate(agate.Mean('Children not working (%)'))
print(
    'The total % of children not working is {}%'.format(
        round(percent_not_working, 2)), '\n')

print('The best countries for children not working are:')

for row in ranked.order_by('Total (%)', reverse=False).limit(20).rows:
    print(row['Countries and areas'], row['Total (%)'],
          row['Children not working (%)'])
def add_year_column(data):
    data['table'] = data['table'].compute([
        ('year',
         agate.Formula(agate.Text(), lambda r: r['date_time'].strftime("%Y"))),
    ])
Exemple #22
0
# 손실 자료만 뽑아서 확인해보면 매칭되지 않은 국가는 많지 않다.
# earth.json 파일을 클리닝 하는 것을 권장한다.
# no_continent = cpi_and_cl.where(lambda x: x['continent'] is None)
#
# for r in no_continent.rows:
#     print(r['Country / Territory'])

# 클리닝된 대륙 데이터
with open(os.path.join(data_dir, 'chp9', 'earth-cleaned.json'), 'r') as f:
    country_json = json.load(f)

country_dict = {}
for dct in country_json:
    country_dict[dct['name']] = dct['parent']

cpi_and_cl = cpi_and_cl.compute([('continent', agate.Formula(agate.Text(), get_country))])

grp_by_cont = cpi_and_cl.group_by('continent')
print(grp_by_cont)

for cont, table in grp_by_cont.items():
    print(cont, len(table.rows))

# 눈으로 확인했을 때 아프리카와 아시아가 높은 값을 가지는 것을 확인할 수 있다.
# 하지만 이것만으로 데이터에 접근하기엔 쉽지 않다.
# 이 때 필요한 것이 집계 메서드이다.
# 국민들이 인식하는 정부 부패 및 아동 노동과 관련하여 대륙들이 어떻게 다른지 비교해보자.
agg = grp_by_cont.aggregate([
    ('cl_mean', agate.Mean('Total (%)')),
    ('cl_max', agate.Max('Total (%)')),
    ('cpi_median', agate.Median('CPI 2013 Score')),
Exemple #23
0
first_match = has_por.find(lambda x: x['Rural'] > 50)
print(first_match['Countries and areas'])

ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    print row['Total (%)'], row['Total Child Labor Rank']


def reverse_percent(row):
    return 100 - row['Total (%)']


ranked = table.compute([
    ('Children not working (%)', agate.Formula(number_type, reverse_percent)),
])

ranked = ranked.compute([
    ('Total Child Labor Rank', agate.Rank('Children not working (%)')),
])

for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    print row['Total (%)'], row['Total Child Labor Rank']

-------3 / 5

import agate
import xlrd

from xlrd.sheet import ctype_text
Exemple #24
0
OUTPUT_DIR = 'docs/samples'

if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

for filename in os.listdir(OUTPUT_DIR):
    os.remove(os.path.join(OUTPUT_DIR, filename))

tester = agate.TypeTester(force={
    ' Date': agate.Date('%Y-%m-%d')
})

emissions = agate.Table.from_csv('examples/epa-emissions-20150910.csv', tester)

emissions = emissions.compute([
    (agate.Formula(agate.Number(), lambda r: r[' Date'].day), 'day'),
    (agate.Formula(agate.Number(), lambda r: r[' SO2 (tons)'] or 0), 'so2'),
    (agate.Formula(agate.Number(), lambda r: r[' NOx (tons)'] or 0), 'noX'),
    (agate.Formula(agate.Number(), lambda r: r[' CO2 (short tons)'] or 0), 'co2')
])

states = emissions.group_by('State')
state_totals = states.aggregate([
    ('so2', agate.Sum(), 'so2'),
    ('co2', agate.Sum(), 'co2'),
    ('noX', agate.Sum(), 'noX')
])

new_york = states['NY']

# NB: key_type shouldn't be necessary--agate bug #234
Exemple #25
0
def percentages(column, precision=2):
    return (column,
            agate.Formula(agate.Text(), lambda r: ('%%.%df' % precision) %
                          (r[column] * 100)))
import json
import agate

country_json = json.loads(open('../../data/chp9/earth.json', 'rb').read())
country_dict = {}

for dct in country_json:
    country_dict[dct['name']] = dct['parent']


def get_country(country_row):
    return country_dict.get(country_row['Country / Territory'].lower())


cpi_and_cl = cpi_and_cl.compute([
    ('continent', agate.Formula(text_type, get_country)),
])
cpi_and_cl.column_names

for r in cpi_and_cl.rows:
    print r['Country / Territory'], r['continent']

no_continent = cpi_and_cl.where(lambda x: x['continent'] is None)
for r in no_continent.rows:
    print r['Country / Territory']

cpi_and_cl = cpi_table.join(ranked,
                            'Country / Territory',
                            'Countries and areas',
                            inner=True)
country_json = json.loads(
Exemple #27
0
print()

# 아동 노동률이 높은 국가의 순위를 알아보자
# 이를 위해서는 Total(%) 열을 기반으로 데이터를 정렬하면 된다.
ranked = table.compute([
    ('Total Child Labor Rank', agate.Rank('Total (%)', reverse=True)),
])
for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    print(row['Total (%)'], row['Total Child Labor Rank'])
print()


# reverse를 사용하지 않고 오름차순 정렬을 하고 싶다면 역 백분율을 기준으로 열을 생성하면 된다.
def reverse_percent(row):
    return 100 - row['Total (%)']


ranked = table.compute([('Children not working (%)',
                         agate.Formula(agate.Number(), reverse_percent))])
ranked = ranked.compute([
    ('Total Child Labor Rank', agate.Rank('Children not working (%)')),
])

for row in ranked.order_by('Total (%)', reverse=True).limit(20).rows:
    print(row['Total (%)'], row['Total Child Labor Rank'])

with open(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'ranked.pickle'), 'wb') as f:
    pickle.dump(ranked, f)
Exemple #28
0
# Read in the courses table

course_params = agate.Table.from_csv(file_courses)

#~
#~ STEP 1
#~
#~ Create a shortlist of courses by using a popularity ranking based on preferences.
#~

# count the preference positions ("preference matrix") for each course
pref_count = prefs_n.pivot('course', 'preference')
# calculate a course popularity index and rank
pref_count = pref_count \
    .compute([
        ('pop', agate.Formula(agate.Number(), popularity_function))
    ]) \
    .compute([
        ('rank', agate.Rank('pop', reverse=True))
    ]) \
    .order_by('rank')

# reorder the preference columns
pref_count_na = list(
    pref_count.exclude(['course', 'pop', 'rank']).column_names)
pref_count_na.sort()
pref_count_na = ['rank', 'pop', 'course'] + pref_count_na
pref_count = pref_count.select(pref_count_na)

#output
pref_count.to_csv(outputdir + 'longlist.csv')
Exemple #29
0
# basic stats
by_game = data.group_by('game_id')

game_count = by_game.aggregate([('count', agate.Count())])

num_games = len(game_count.rows)
num_calls = len(data.rows)
num_calls_incorrect = len(incorrect.rows)
percent_incorrect = float(num_calls_incorrect) / float(num_calls) * 100.0
num_calls_incorrect_per_game = float(num_calls_incorrect) / float(num_games)

# calls incorrect per minute played
data_with_quarter = data.compute([
    ('quarter',
     agate.Formula(agate.Number(),
                   lambda r: int(r['period'].replace('Q', '')))),
    ('quarter_id',
     agate.Formula(agate.Text(), lambda r: r['game_id'] + '-' + r['period']))
])

# find total minutes played
unique_quarters = data_with_quarter.distinct('quarter_id').select(['quarter'])


def getMinutes(row):
    if (row['quarter'] == 4):
        return 2
    return 5


quarter_minutes = unique_quarters.compute([
# 计算Place of residence (%) Urban列的平均值
# table.aggregate(agate.Mean('Place of residence (%) Urban'))
# has_por = table.where(lambda r: r['Place of residence (%)Urban'] is not None)
# has_por.aggregate(agate.Mean('Place of residence (%)Urban'))
# 得出每行数据中农村童工雇佣率大于50%的数据
# first_match = has_por.find(lambda x: x['Rural'] > 50)
# print(first_match['Countries and areas'])

# 使用Total列数据对童工雇佣率最高的数据进行排序,从高到低
# ranked = table.compute([("Total Child Labor Rank", agate.Rank('Total (%)', reverse=True))])
# for row in ranked.order_by('Total (%)', reverse=True,).limit(20).rows:
# 	print(row['Total (%)'], row['Countries and areas'])
# 普通儿童占比
ranked = table.compute([("Children not working (%)",
                         agate.Formula(number_type, reverse_percent))])
ranked = ranked.compute([("Total Child Labor Rank",
                          agate.Rank("Children not working (%)"))])
# for row in ranked.order_by('Total (%)', reverse=True,).limit(20).rows:
# 	print(row['Total (%)'], row['Countries and areas'])

# 国际公开腐败感指数处理
cpi_workbook = xlrd.open_workbook('data/chp9/corruption_perception_index.xls')
cpi_sheet = cpi_workbook.sheets()[0]
# for r in range(cpi_sheet.nrows):
# 	print(r, cpi_sheet.row_values(r))

cpi_title_rows = zip(cpi_sheet.row_values(1), cpi_sheet.row_values(2))
cpi_titles = [t[0] + ' ' + t[1] for t in cpi_title_rows]
cpi_titles = [t.strip() for t in cpi_titles]
cpi_titles[0] = cpi_titles[0] + " Duplicate"