def test_cleanup(self):
    """Stale cache files not belonging to any stage are removed on run.

    Creates a two-stage analysis, plants a bogus third ``.cache`` file,
    and verifies the next run deletes it while keeping the real two.
    """
    self.assertFalse(os.path.exists(TEST_CACHE))

    analysis = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    analysis.then(self.stage2)

    data = {}

    # Hoisted: the same glob pattern is queried three times below.
    pattern = os.path.join(TEST_CACHE, '*.cache')

    # Initial run, creates two cache files
    analysis.run(data)
    cache_files = glob(pattern)
    self.assertEqual(len(cache_files), 2)

    # Create false third cache file. Use a context manager so the
    # handle is closed even if the touch fails partway.
    with open(os.path.join(TEST_CACHE, 'foo.cache'), 'a'):
        pass

    cache_files2 = glob(pattern)
    self.assertEqual(len(cache_files2), 3)

    # Second run, removes false cache file
    analysis.run(data)
    cache_files3 = glob(pattern)
    self.assertEqual(len(cache_files3), 2)
    self.assertSequenceEqual(cache_files, cache_files3)
def test_cache_reused(self):
    """An identical second analysis is served entirely from cache."""
    first = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    first.then(self.stage2)
    first.run()
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage2, 1)

    # Rebuild the exact same pipeline: cached results should satisfy
    # it, so neither stage's execution counter moves.
    second = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    second.then(self.stage2)
    second.run()
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage2, 1)
def test_ancestor_changed(self):
    """Changing a stage's ancestry invalidates its cached result."""
    pipeline = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    middle = pipeline.then(self.stage_noop)
    middle.then(self.stage2)
    pipeline.run()
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage2, 1)

    # stage2 now hangs directly off stage1 — its chain of ancestors
    # differs, so stage1 stays cached but stage2 must run again.
    reshaped = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    reshaped.then(self.stage2)
    reshaped.run()
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage2, 2)
def test_same_function_twice_sequence(self):
    """A stage function may appear more than once in a single chain."""
    chain = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    chain.then(self.stage2)
    chain.then(self.stage_noop)
    chain.then(self.stage2)
    chain.run()

    # stage2 executes once per occurrence in the sequence.
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage2, 2)
def test_never_cache(self):
    """A never-cache stage re-executes on every run of the analysis."""
    pipeline = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    pipeline.then(self.stage_never_cache)

    pipeline.run()
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage_never_cache, 1)

    # stage1 is served from its cache; the never-cache stage is not.
    pipeline.run()
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage_never_cache, 2)
def test_data_flow(self):
    """Each stage sees its ancestors' output; the caller's dict is untouched."""
    pipeline = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    pipeline.then(self.stage2)

    initial = {}
    pipeline.run(initial)

    # The dict handed to run() is never mutated in place.
    self.assertEqual(initial, {})

    # Each stage received a snapshot containing its ancestors' output
    # and contributed its own key on the way out.
    self.assertEqual(self.data_before_stage1, {})
    self.assertEqual(self.data_after_stage1, {'stage1': 5})
    self.assertEqual(self.data_before_stage2, {'stage1': 5})
    self.assertEqual(self.data_after_stage2, {'stage1': 5, 'stage2': 25})
def test_ancestor_fingerprint_deleted(self):
    """Deleting the root stage's cache file forces the whole chain to re-run."""
    pipeline = proof.Analysis(self.stage1, cache_dir=TEST_CACHE)
    pipeline.then(self.stage2)
    pipeline.run()
    self.assertEqual(self.executed_stage1, 1)
    self.assertEqual(self.executed_stage2, 1)

    # Remove the root's cache file on disk: both the root and its
    # descendant must execute again on the next run.
    os.remove(pipeline._cache_path)
    pipeline.run()
    self.assertEqual(self.executed_stage1, 2)
    self.assertEqual(self.executed_stage2, 2)
    # NOTE(review): this statement appears to complete a print helper whose
    # `def` line is above this view — confirm against the full file.
    data['year_police_beat'].print_table()


def print_year_data(data):
    """Print the per-year grouped table from the analysis data dict."""
    data['groupped_year'].print_table()


def print_full_hour_data(data):
    """Print the table of counts bucketed by full hour."""
    data['full_hour'].print_table()


def print_data(data):
    """Print the raw loaded table without truncating any columns."""
    data['table'].print_table(max_columns=None)


# Pipeline wiring: each then() chains a stage onto its parent's output.
data_loaded = proof.Analysis(load_data)

# Per-year branch: add a year column, sum counts, upload.
year_data = data_loaded.then(add_year_column)
groupped_data = year_data.then(year_sum_counts)
groupped_data.then(upload_killed_injured_year)

# Per-year-and-police-beat branch off the same year data.
year_police_beat_data = year_data.then(year_police_beat_sum_counts)
year_police_beat_data.then(upload_killed_injured_year_police_beat)

# Raw accidents upload straight from the loaded data.
data_loaded.then(upload_accidents)

# Full-hour branch: derive full-hour dates, sum counts, upload.
hour_data = data_loaded.then(add_full_hour_date)
full_hour_data = hour_data.then(sum_counts_by_full_hour)
full_hour_data.then(upload_full_hour)

# Execute the whole analysis tree.
data_loaded.run()
race_groups = only_with_age.group_by('race') # Sub-group by age cohorts (20s, 30s, etc.) race_and_age_groups = race_groups.group_by( lambda r: '%i0s' % (r['age'] // 10), key_name='age_group' ) # Aggregate medians for each group medians = race_and_age_groups.aggregate([ ('count', agate.Count()), ('median_years_in_prison', agate.Median('years_in_prison')) ]) # Sort the results sorted_groups = medians.order_by('median_years_in_prison', reverse=True) # Print out the results sorted_groups.print_table(max_rows=10) analysis = proof.Analysis(load_data) analysis.then(confessions) analysis.then(median_age) analysis.then(youth) years_analysis = analysis.then(years_in_prison) years_analysis.then(states) years_analysis.then(race_and_age) analysis.run()
def test_cache_unicode(self):
    """A stage producing unicode data caches without raising."""
    pipeline = proof.Analysis(self.stage_unicode, cache_dir=TEST_CACHE)
    pipeline.run()
    self.assertEqual(self.executed_stage_unicode, 1)
def main():
    """Build the loading pipeline, prepare it, and persist the training data."""
    loaded = proof.Analysis(load_data)
    prepared = prepare(loaded)
    save_train(loaded, prepared)