# _overall_stats: derive package- and resource-level statistics with agate and
# store them on the instance.
#
# Reads:  self._package_table(), self._package_resource_table(),
#         utils.OPEN_LICENSES, open_formats_count
# Writes: self.overall_package_stats, self.resource_stats, self.open_datasets,
#         self.open_format_count, self.format_count; the last statement calls
#         self.compute_dates().
#
# Statement order:
#   1. 'open_data_count': an agate.Summary over the 'license_id' column that
#      sums how many of its values are members of utils.OPEN_LICENSES.
#   2. 'open_format': a Boolean column computed for each resource row by
#      open_formats_count.
#   3. Guarded by len(self._package_resource_table()) > 0: aggregate
#      'open_format_count' (Count of 'open_format' == True) together with the
#      Min / Max of the 'created' column.
#   4. Group the resource table by "format", count rows per group, then count
#      the groups themselves as 'different_formats'.
#   5. Copy the three counts onto self with .get(<key>, 0).
#
# NOTE(review): the whole definition sits on one physical line, so the extent
# of the `if len(...) > 0:` body cannot be read from the text. Confirm which
# of the following statements belong to it before restoring the indentation;
# the readings differ in what happens for an empty resource table.
# NOTE(review): self._package_resource_table() is called three times;
# presumably each call yields an equivalent table -- verify, then consider
# calling it once.
def _overall_stats(self): count_open_licenses = agate.Summary( 'license_id', agate.Number(), lambda r: sum(license_id in utils.OPEN_LICENSES for license_id in r.values())) self.overall_package_stats = self._package_table().aggregate([ ('open_data_count', count_open_licenses), ]) self.resource_stats = self._package_resource_table().compute([ ('open_format', agate.Formula(agate.Boolean(), open_formats_count)), ]) if len(self._package_resource_table()) > 0: self.resource_stats = self.resource_stats.aggregate([ ('open_format_count', agate.Count('open_format', True)), ('min_date', agate.Min('created')), ('max_date', agate.Max('created')) ]) format_table = self._package_resource_table().group_by( "format").aggregate([ ('count', agate.Count()), ]) count = format_table.aggregate([ ('different_formats', agate.Count()), ]) self.open_datasets = self.overall_package_stats.get( "open_data_count", 0) self.open_format_count = self.resource_stats.get( "open_format_count", 0) self.format_count = count.get("different_formats", 0) self.compute_dates()
def get_package_date_aggregates(self, package_table):
    """Aggregate the smallest and largest ``created`` value of a table.

    :param package_table: object exposing ``aggregate`` (an agate table is
        expected) that contains a ``created`` column.
    :returns: the result of ``package_table.aggregate`` for the two
        aggregations ``min_date`` and ``max_date``.
    """
    date_bounds = [
        ('min_date', agate.Min('created')),
        ('max_date', agate.Max('created')),
    ]
    return package_table.aggregate(date_bounds)
# NOTE(review): `f` is expected to be an already-open JSON file handle supplied
# by the code that precedes this fragment -- confirm.
country_json = json.load(f)

# Map every entry's 'name' to its 'parent'; presumably consulted by
# get_country when the 'continent' column is filled in below.
country_dict = {dct['name']: dct['parent'] for dct in country_json}

# Add a text column 'continent' whose value is produced per row by get_country.
continent_column = ('continent', agate.Formula(agate.Text(), get_country))
cpi_and_cl = cpi_and_cl.compute([continent_column])

# Split the table into one sub-table per continent and show the group sizes.
grp_by_cont = cpi_and_cl.group_by('continent')
print(grp_by_cont)
for cont, table in grp_by_cont.items():
    print(cont, len(table.rows))

# Eyeballing the output, Africa and Asia appear to have high values.
# But that alone does not make the data easy to work with.
# This is where aggregation methods come in.
# Let's compare how the continents differ with respect to perceived
# government corruption and child labor.
continent_aggregations = [
    ('cl_mean', agate.Mean('Total (%)')),
    ('cl_max', agate.Max('Total (%)')),
    ('cpi_median', agate.Median('CPI 2013 Score')),
    ('cpi_min', agate.Min('CPI 2013 Score')),
]
agg = grp_by_cont.aggregate(continent_aggregations)

agg.print_table()
print()
agg.print_bars('continent', 'cl_max')

# Persist the table (including the new 'continent' column) next to this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
pickle_path = os.path.join(script_dir, 'cpi_and_cl_2.pickle')
with open(pickle_path, 'wb') as f:
    pickle.dump(cpi_and_cl, f)
# Write the course list, then export the per-student and per-course selections
# and print two summaries on the console.
# NOTE(review): `output_csv` is not defined in this fragment -- presumably a
# project helper attached to the table objects; confirm.
courselist.output_csv('courselist')

# Round-trip through a scratch CSV: workaround to fix .normalize() indexing
# issue. The try/finally guarantees the scratch file is removed even when
# writing or re-reading it raises, instead of being left behind.
try:
    selected_courses.to_csv('temp.csv')
    selected_courses = agate.Table.from_csv('temp.csv')
finally:
    if os.path.exists('temp.csv'):
        os.remove('temp.csv')

# Attach course details and the student's preference to every selection, then
# keep only the columns needed for the reports.
selected_courses = (
    selected_courses
    .join(courselist, 'courseid', 'id')
    .join(prefs_n, ['student', 'course'], ['student', 'course'])
    .select(['student', 'courseid', 'course', 'block', 'preference'])
)

# Report 1: ordered by student, then block.
selected_courses = selected_courses.order_by(
    lambda r: (r['student'], r['block']))
selected_courses.output_csv('selections_by_student')

# Report 2: same rows, reordered columns, ordered by course, then block.
(
    selected_courses
    .select(['course', 'block', 'courseid', 'student'])
    .order_by(lambda r: (r['course'], r['block']))
    .output_csv('selections_by_course')
)

# finally output something on the console
selected_courses.pivot('block').print_table()
print('Highest preference used:')
# Max preference per student, then a count of students per max value.
(
    selected_courses
    .pivot('student', aggregation=agate.Max('preference'))
    .pivot('Max')
    .order_by('Max')
    .print_table()
)
# Fragment: the leading `return new_table` is the tail of a function whose
# definition starts before this excerpt; everything after it is the script
# entry point (`if __name__ == '__main__':`).
#
# Entry point, in statement order:
#   1. generate_test_data() supplies the rows; an agate.Table is built from
#      them using column_names / column_types.
#   2. The table is grouped by 'pb' (by_payband) and summarised per group:
#      row count plus min / max / median of 'salary'.
#   3. The summary is printed as a table and as a bar chart of 'count' per
#      'pb' (width=40).
#   4. _add_random_column(tbl) produces new_table, which is pivoted on
#      'random_group' and ordered by 'random_group'.
# NOTE(review): the block may continue past the end of this excerpt.
return new_table if __name__ == '__main__': data_lists = generate_test_data() # Create data table tbl = agate.Table(data_lists, column_names=column_names, column_types=column_types) # Produce summary table by_payband = tbl.group_by('pb') summary_tbl = by_payband.aggregate([('count', agate.Count()), ('sal_min', agate.Min('salary')), ('sal_max', agate.Max('salary')), ('sal_median', agate.Median('salary')) ]) # Display summary of generated test data print('Model data summary:\n') summary_tbl.print_table() print() summary_tbl.print_bars('pb', 'count', width=40) # ---Generate random numbers for simulation new_table = _add_random_column(tbl) # Show distributions of new table rand_tbl_count = new_table.pivot('random_group') rand_tbl_count = rand_tbl_count.order_by('random_group')
# Fragment: the first print() presumably belongs to a loop that starts before
# this excerpt -- confirm.
#
# Statement order:
#   1. female_data keeps the rows whose 'Female' value is not None; the ten
#      rows with the highest 'Female' value are printed as
#      "<Countries and areas>: <Female>%".
#   2. .mean() is attempted on the 'Place of residence (%) Urban' column and
#      any exception is ignored.
#   3. has_por keeps the rows where that column is not None; Mean and Max are
#      then aggregated for it and for 'Rural' (return values are discarded).
#   4. has_por.find(...) is called with the predicate 'Rural' > 50 (return
#      value discarded).
#   5. ranked adds 'Total Child Labor Rank' via agate.Rank on 'Total (%)'
#      with reverse=True.
#   6. reverse_percent(row) returns 100 minus the row's 'Total (%)'.
#
# NOTE(review): the bare `except: pass` also swallows KeyboardInterrupt and
# SystemExit; narrow it to the exception actually expected if that is known.
# NOTE(review): the compute() tuple is written as (computation, name);
# presumably this targets an agate release that used that order -- confirm
# against the installed version.
print('{}: {}%'.format(r['Countries and areas'], r['Female'])) female_data = table.where(lambda r: r['Female'] is not None) most_females = female_data.order_by('Female', reverse=True).limit(10) for r in most_females.rows: print('{}: {}%'.format(r['Countries and areas'], r['Female'])) try: table.columns['Place of residence (%) Urban'].mean() except: pass has_por = table.where(lambda r: r['Place of residence (%) Urban'] is not None) has_por.columns['Place of residence (%) Urban'].aggregate(agate.Mean()) has_por.columns['Place of residence (%) Urban'].aggregate(agate.Max()) has_por.columns['Rural'].aggregate(agate.Mean()) has_por.columns['Rural'].aggregate(agate.Max()) has_por.find(lambda x: x['Rural'] > 50) ranked = table.compute([(agate.Rank('Total (%)', reverse=True), 'Total Child Labor Rank')]) # If we wanted a column showing children not working percentage ... def reverse_percent(row): return 100 - row['Total (%)']