def test_order_by_preserves_rows(self):
    """
    Check row-object identity across successive ``order_by`` calls.

    The first ordered table must hold a different ``_data[0]`` object than
    the freshly built table, while a second ordering must hold the very same
    object as the first ordered table.
    """
    def by_one(row):
        return row['one']

    original = Table(self.rows, self.column_types, self.column_names)
    first_pass = original.order_by(by_one)
    second_pass = first_pass.order_by(by_one)

    self.assertIsNot(original._data[0], first_pass._data[0])
    self.assertIs(first_pass._data[0], second_pass._data[0])
def test_where_preserves_rows(self):
    """
    Check row-object identity across successive ``where`` calls.

    The first filtered table must hold a different ``_data[0]`` object than
    the freshly built table, while filtering again must hold the very same
    object as the first filtered table.
    """
    def one_is_one(row):
        return row['one'] == 1

    original = Table(self.rows, self.column_types, self.column_names)
    first_pass = original.where(one_is_one)
    second_pass = first_pass.where(one_is_one)

    self.assertIsNot(original._data[0], first_pass._data[0])
    self.assertIs(first_pass._data[0], second_pass._data[0])
def analyse_insights():
    """
    Generate the insights summary CSV and the per-metric histograms.

    Reads ``www/live-data/insights.csv`` (the first row is used as column
    names), aggregates by ``provider_type`` over every pairing of
    ``FACEBOOK_METRICS`` with ``SUMMARY_TYPES``, adds a
    ``provider_type_count_pct`` column, orders by ``provider_type`` and
    writes the result to ``www/live-data/insights_summary.csv``. Finally
    ``_generate_insights_histograms`` is called once per metric.
    """
    column_types = (
        date_type,
        number_type, number_type, number_type, number_type, number_type,
        boolean_type,
        text_type, text_type, text_type, text_type,
        boolean_type,
        text_type, text_type,
    )

    with open('www/live-data/insights.csv') as f:
        rows = list(csv.reader(f))

    # The header row is removed from the data and used for the column names.
    column_names = rows.pop(0)
    table = Table(rows, column_types, column_names)

    metric_summaries = list(itertools.product(FACEBOOK_METRICS, SUMMARY_TYPES))
    summary = table.aggregate('provider_type', metric_summaries)

    count_grand_total = summary.columns['provider_type_count'].sum()

    def count_pct(row):
        # Share of this provider type in the overall row count, in percent.
        return (row['provider_type_count']/count_grand_total) * 100

    summary = summary.compute('provider_type_count_pct', number_type, count_pct)
    summary = summary.order_by('provider_type')

    _write_summary_csv(summary, 'www/live-data/insights_summary.csv')

    for metric in FACEBOOK_METRICS:
        _generate_insights_histograms(metric, table, summary)
def test_limit_preserves_rows(self):
    """
    Check row-object identity across successive ``limit`` calls.

    The first limited table must hold a different ``_data[0]`` object than
    the freshly built table, while limiting again must hold the very same
    object as the first limited table.
    """
    original = Table(self.rows, self.column_types, self.column_names)
    first_pass = original.limit(2)
    second_pass = first_pass.limit(2)

    self.assertIsNot(original._data[0], first_pass._data[0])
    self.assertIs(first_pass._data[0], second_pass._data[0])
def analyse_photo_efforts():
    """
    Summarise photo effort by homepage placement and by contribution.

    Reads ``www/live-data/photo_efforts.csv`` (the first row is used as
    column names) and writes two summaries:

    * ``www/live-data/homepage_summary.csv`` -- grouped by ``on_homepage``
    * ``www/live-data/contribution_summary.csv`` -- grouped by
      ``contribution``, ordered by ``contribution_count`` descending

    Both summaries carry the summed ``duration`` plus ``<group>_count_pct``
    and ``duration_sum_pct`` columns. Every percentage is computed against
    the total of the matching column of the *same* summary table.
    """
    column_types = (number_type, text_type, text_type, text_type, boolean_type)

    with open('www/live-data/photo_efforts.csv') as f:
        rows = list(csv.reader(f))

    # The header row is removed from the data and used for the column names.
    column_names = rows.pop(0)
    table = Table(rows, column_types, column_names)

    # --- Homepage summary -------------------------------------------------
    homepage_summary = table.aggregate('on_homepage', (('duration', 'sum'),))

    homepage_count_total = homepage_summary.columns['on_homepage_count'].sum()
    homepage_duration_total = homepage_summary.columns['duration_sum'].sum()

    homepage_summary = homepage_summary.compute(
        'on_homepage_count_pct', number_type,
        lambda x: (x['on_homepage_count']/homepage_count_total) * 100)
    homepage_summary = homepage_summary.compute(
        'duration_sum_pct', number_type,
        lambda x: (x['duration_sum']/homepage_duration_total) * 100)

    _write_summary_csv(homepage_summary, 'www/live-data/homepage_summary.csv')

    # --- Contribution summary ---------------------------------------------
    contribution_summary = table.aggregate('contribution', (('duration', 'sum'),))
    contribution_summary = contribution_summary.order_by('contribution_count', reverse=True)

    # Use totals taken from this summary. Previously both percentages were
    # divided by the homepage duration total left over in a reused variable,
    # so the count percentage was a count divided by a duration.
    contribution_count_total = contribution_summary.columns['contribution_count'].sum()
    contribution_duration_total = contribution_summary.columns['duration_sum'].sum()

    contribution_summary = contribution_summary.compute(
        'contribution_count_pct', number_type,
        lambda x: (x['contribution_count']/contribution_count_total) * 100)
    contribution_summary = contribution_summary.compute(
        'duration_sum_pct', number_type,
        lambda x: (x['duration_sum']/contribution_duration_total) * 100)

    _write_summary_csv(contribution_summary, 'www/live-data/contribution_summary.csv')
def analyse_insights():
    """
    Generate reports from the insights data.

    Loads ``www/live-data/insights.csv`` (first row supplies the column
    names), aggregates by ``provider_type`` using every combination of
    ``FACEBOOK_METRICS`` and ``SUMMARY_TYPES``, appends a
    ``provider_type_count_pct`` column, sorts by ``provider_type`` and
    writes ``www/live-data/insights_summary.csv``. Afterwards
    ``_generate_insights_histograms`` runs once for each metric.
    """
    # One date column, five numbers, a boolean, four texts, a boolean and
    # two trailing texts -- fourteen column types in total.
    column_types = (
        (date_type,)
        + (number_type,) * 5
        + (boolean_type,)
        + (text_type,) * 4
        + (boolean_type,)
        + (text_type,) * 2
    )

    with open('www/live-data/insights.csv') as f:
        rows = list(csv.reader(f))
        column_names = rows.pop(0)  # header row

    table = Table(rows, column_types, column_names)

    summary_definition = list(
        itertools.product(FACEBOOK_METRICS, SUMMARY_TYPES))
    summary = table.aggregate('provider_type', summary_definition)

    count_grand_total = summary.columns['provider_type_count'].sum()

    def share_of_total(row):
        # Percentage of all counted rows that fall in this provider type.
        return (row['provider_type_count'] / count_grand_total) * 100

    summary = summary.compute(
        'provider_type_count_pct', number_type, share_of_total)
    summary = summary.order_by('provider_type')

    _write_summary_csv(summary, 'www/live-data/insights_summary.csv')

    for metric in FACEBOOK_METRICS:
        _generate_insights_histograms(metric, table, summary)
def test_aggregeate_bad_column(self):
    """
    Check that ``aggregate`` raises ``ColumnDoesNotExistError`` for the
    column name ``'bad'``, both as the grouping column and as the column of
    an aggregation operation.
    """
    table = Table(self.rows, self.column_types, self.column_names)

    failing_calls = (
        ('bad', (('one', 'sum'), )),
        ('two', (('bad', 'sum'), )),
    )

    for group_column, operations in failing_calls:
        with self.assertRaises(ColumnDoesNotExistError):
            table.aggregate(group_column, operations)
def test_column_names_immutable(self):
    """
    Check that changing the caller's name list after construction does not
    change what ``get_column_names`` reports.
    """
    names = ['one', 'two', 'three']
    table = Table(self.rows, self.column_types, names)

    # Mutate the list that was handed to the constructor.
    names[0] = 'five'

    first_name = table.get_column_names()[0]
    self.assertEqual(first_name, 'one')
def test_pearson_correlation(self):
    """
    Check ``pearson_correlation`` on a three-row table.

    A column against itself must give exactly ``Decimal('1')``; columns
    ``one`` and ``two`` must give approximately ``sqrt(3) * 0.5``.
    """
    data = (
        (-1, 0, 'a'),
        (0, 0, 'b'),
        (1, 3, 'c'),
    )
    table = Table(data, self.column_types, self.column_names)

    self_correlation = table.pearson_correlation('one', 'one')
    self.assertEqual(self_correlation, Decimal('1'))

    cross_correlation = table.pearson_correlation('one', 'two')
    expected = Decimal('3').sqrt() * Decimal('0.5')
    self.assertAlmostEqual(cross_correlation, expected)
def test_limit(self):
    """
    Check that ``limit(2)`` returns a distinct table with two rows, and
    verify its first row and its ``one`` column.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    limited = source.limit(2)

    self.assertIsNot(limited, source)
    self.assertEqual(len(limited.rows), 2)

    first_row = limited.rows[0]
    self.assertSequenceEqual(first_row, (1, 4, 'a'))

    column_one = limited.columns['one']
    self.assertSequenceEqual(column_one, (1, 2))
def test_order_by_reverse(self):
    """
    Check the row order produced by ``order_by`` on column ``two`` with
    ``reverse=True``.
    """
    def by_two(row):
        return row['two']

    source = Table(self.rows, self.column_types, self.column_names)
    ordered = source.order_by(by_two, reverse=True)

    self.assertEqual(len(ordered.rows), 3)

    expected_rows = (
        (1, 4, 'a'),
        (2, 3, 'b'),
        (None, 2, 'c'),
    )
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(ordered.rows[index], expected)
def test_where(self):
    """
    Check that ``where`` returns a distinct table holding only the rows
    whose ``one`` value is ``2`` or ``None``.
    """
    def one_is_two_or_null(row):
        return row['one'] in (2, None)

    source = Table(self.rows, self.column_types, self.column_names)
    filtered = source.where(one_is_two_or_null)

    self.assertIsNot(filtered, source)
    self.assertEqual(len(filtered.rows), 2)

    first_row = filtered.rows[0]
    self.assertSequenceEqual(first_row, (2, 3, 'b'))

    column_one = filtered.columns['one']
    self.assertSequenceEqual(column_one, (2, None))
def test_limit_slice_negative(self):
    """
    Check ``limit(-2, step=-1)``: the result is a distinct two-row table
    whose rows and ``one`` column match the expected values.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    limited = source.limit(-2, step=-1)

    self.assertIsNot(limited, source)
    self.assertEqual(len(limited.rows), 2)

    expected_rows = (
        (2, 3, 'b'),
        (1, 4, 'a'),
    )
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(limited.rows[index], expected)

    self.assertSequenceEqual(limited.columns['one'], (2, 1))
def test_limit_step_only(self):
    """
    Check ``limit(step=2)``: the result is a distinct two-row table whose
    rows and ``one`` column match the expected values.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    limited = source.limit(step=2)

    self.assertIsNot(limited, source)
    self.assertEqual(len(limited.rows), 2)

    expected_rows = (
        (1, 4, 'a'),
        (None, 2, 'c'),
    )
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(limited.rows[index], expected)

    self.assertSequenceEqual(limited.columns['one'], (1, None))
def test_compute_creates_rows(self):
    """
    Check that each ``compute`` call produces new row data.

    After every ``compute`` the first ``_data`` entry must be a different
    object with a different value than before, and the first ``_data``
    entry of the starting table must still equal ``(1, 4, 'a')``.
    """
    def copy_one(row):
        return row['one']

    base = Table(self.rows, self.column_types, self.column_names)
    with_new2 = base.compute('new2', self.number_type, copy_one)
    with_new3 = with_new2.compute('new3', self.number_type, copy_one)

    # Each (before, after) pair must differ in identity and in value.
    for before, after in ((base, with_new2), (with_new2, with_new3)):
        self.assertIsNot(before._data[0], after._data[0])
        self.assertNotEqual(before._data[0], after._data[0])

    self.assertSequenceEqual(base._data[0], (1, 4, 'a'))
def test_pearson_correlation_zero(self):
    """
    Check that ``pearson_correlation`` returns ``Decimal('0')`` when the
    second column holds the same value in every row.
    """
    data = ((-1, 3, 'a'), (0, 3, 'b'), (1, 3, 'c'))
    table = Table(data, self.column_types, self.column_names)

    correlation = table.pearson_correlation('one', 'two')

    self.assertEqual(correlation, Decimal('0'))
def test_fork_preserves_data(self):
    """
    Check that ``_fork`` reuses the row objects it is given.

    For each of the three rows, the forked table's ``_data`` entry and its
    ``rows`` entry must be the same object as the source table's row.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    forked = source._fork(source.rows)

    for index in range(3):
        self.assertIs(source.rows[index], forked._data[index])

    for index in range(3):
        self.assertIs(source.rows[index], forked.rows[index])
def setUp(self):
    """
    Build the shared fixture: four rows, one text column followed by three
    number columns named ``one`` to ``four``, and a ``Table`` built from them.
    """
    self.rows = (
        ('a', 2, 3, 4),
        (None, 3, 5, None),
        ('a', 2, 4, None),
        ('b', 3, 4, None),
    )

    self.number_type = NumberType()
    self.text_type = TextType()

    # One text column followed by three columns sharing the number type.
    self.column_types = (self.text_type,) + (self.number_type,) * 3
    self.column_names = ('one', 'two', 'three', 'four')

    self.table = Table(self.rows, self.column_types, self.column_names)
def test_pearson_correlation(self):
    """
    Check ``pearson_correlation`` for a column against itself (exactly
    ``Decimal('1')``) and for ``one`` against ``two`` (approximately
    ``sqrt(3) * 0.5``).
    """
    sample = ((-1, 0, 'a'), (0, 0, 'b'), (1, 3, 'c'))
    table = Table(sample, self.column_types, self.column_names)

    identical = table.pearson_correlation('one', 'one')
    self.assertEqual(identical, Decimal('1'))

    observed = table.pearson_correlation('one', 'two')
    half_root_three = Decimal('3').sqrt() * Decimal('0.5')
    self.assertAlmostEqual(observed, half_root_three)
def test_distinct_column(self):
    """
    Check ``distinct('one')``: the result is a distinct table with two
    rows, and its rows and ``one`` column match the expected values.
    """
    data = (
        (1, 2, 'a'),
        (2, None, None),
        (1, 1, 'c'),
        (1, None, None),
    )
    source = Table(data, self.column_types, self.column_names)
    deduped = source.distinct('one')

    self.assertIsNot(deduped, source)
    self.assertEqual(len(deduped.rows), 2)

    expected_rows = ((1, 2, 'a'), (2, None, None))
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(deduped.rows[index], expected)

    self.assertSequenceEqual(deduped.columns['one'], (1, 2))
def test_order_by_func(self):
    """
    Check ``order_by`` with a key function returning the pair
    ``(one, two)``: the result is a distinct three-row table in the
    expected order.
    """
    def by_one_then_two(row):
        return (row['one'], row['two'])

    data = (
        (1, 2, 'a'),
        (2, 1, 'b'),
        (1, 1, 'c'),
    )
    source = Table(data, self.column_types, self.column_names)
    ordered = source.order_by(by_one_then_two)

    self.assertIsNot(ordered, source)
    self.assertEqual(len(ordered.rows), 3)

    expected_rows = ((1, 1, 'c'), (1, 2, 'a'), (2, 1, 'b'))
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(ordered.rows[index], expected)
def test_chain_select_where(self):
    """
    Check chaining ``select`` into ``where``.

    After selecting ``one`` and ``two`` and keeping rows where ``two`` is
    3, the result must have one row, two columns, and matching column
    types and names.
    """
    def two_is_three(row):
        return row['two'] == 3

    source = Table(self.rows, self.column_types, self.column_names)
    selected = source.select(('one', 'two'))
    result = selected.where(two_is_three)

    self.assertEqual(len(result.rows), 1)
    self.assertSequenceEqual(result.rows[0], (2, 3))

    self.assertEqual(len(result.columns), 2)
    expected_types = (self.number_type, self.number_type)
    self.assertSequenceEqual(result._column_types, expected_types)
    self.assertEqual(result._column_names, ('one', 'two'))
    self.assertSequenceEqual(result.columns['one'], (2,))
def test_aggregate_two_ops(self):
    """
    Check ``aggregate`` grouped by ``one`` with both ``sum`` and ``mean``
    of ``two``: the result is a distinct table with three rows and four
    columns, with the expected column names and row values.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    operations = (('two', 'sum'), ('two', 'mean'))
    aggregated = source.aggregate('one', operations)

    self.assertIsNot(aggregated, source)
    self.assertEqual(len(aggregated.rows), 3)
    self.assertEqual(len(aggregated.columns), 4)

    expected_names = ('one', 'one_count', 'two_sum', 'two_mean')
    self.assertSequenceEqual(aggregated._column_names, expected_names)

    expected_rows = (
        ('a', 2, 4, 2),
        (None, 1, 3, 3),
        ('b', 1, 3, 3),
    )
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(aggregated.rows[index], expected)
def test_chain_select_where(self):
    """
    Check that ``select`` followed by ``where`` yields a one-row,
    two-column table with the expected row, column types, column names and
    ``one`` column.
    """
    wanted_columns = ('one', 'two')

    def keep_two_equal_three(row):
        return row['two'] == 3

    table = Table(self.rows, self.column_types, self.column_names)
    narrowed = table.select(wanted_columns)
    filtered = narrowed.where(keep_two_equal_three)

    self.assertEqual(len(filtered.rows), 1)
    self.assertSequenceEqual(filtered.rows[0], (2, 3))
    self.assertEqual(len(filtered.columns), 2)
    self.assertSequenceEqual(
        filtered._column_types, (self.number_type, self.number_type))
    self.assertEqual(filtered._column_names, ('one', 'two'))
    self.assertSequenceEqual(filtered.columns['one'], (2, ))
def test_distinct_func(self):
    """
    Check ``distinct`` with a key function returning ``(two, three)``: the
    result is a distinct three-row table with the expected rows and
    ``one`` column.
    """
    def two_and_three(row):
        return (row['two'], row['three'])

    data = (
        (1, 2, 'a'),
        (2, None, None),
        (1, 1, 'c'),
        (1, None, None),
    )
    source = Table(data, self.column_types, self.column_names)
    deduped = source.distinct(two_and_three)

    self.assertIsNot(deduped, source)
    self.assertEqual(len(deduped.rows), 3)

    expected_rows = ((1, 2, 'a'), (2, None, None), (1, 1, 'c'))
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(deduped.rows[index], expected)

    self.assertSequenceEqual(deduped.columns['one'], (1, 2, 1))
def test_mad_outliers_reject(self):
    """
    Check ``mad_outliers('one', reject=True)`` on ten rows with 50 plus a
    single row with 200: only the 200 row must remain.
    """
    # Ten references to the same typical row, then a single extreme row.
    data = [(50, 4, 'a')] * 10 + [(200, 1, 'b')]

    table = Table(data, self.column_types, self.column_names)
    outliers = table.mad_outliers('one', reject=True)

    self.assertEqual(len(outliers.rows), 1)
    self.assertSequenceEqual(outliers.columns['one'], (200,))
def test_order_by_nulls(self):
    """
    Check that ``order_by`` places ``None`` values after all other values,
    for both the ``two`` and the ``three`` column.
    """
    data = (
        (1, 2, None),
        (2, None, None),
        (1, 1, 'c'),
        (1, None, 'a'),
    )
    table = Table(data, self.column_types, self.column_names)

    by_two = table.order_by('two')
    self.assertSequenceEqual(by_two.columns['two'], (1, 2, None, None))

    by_three = table.order_by('three')
    self.assertSequenceEqual(by_three.columns['three'], ('a', 'c', None, None))
def test_mad_outliers(self):
    """
    Check ``mad_outliers('one')`` on ten rows with 50 plus a single row
    with 200: ten rows must remain and 200 must be absent from ``one``.
    """
    # Ten references to the same typical row, then a single extreme row.
    data = [(50, 4, 'a')] * 10 + [(200, 1, 'b')]

    table = Table(data, self.column_types, self.column_names)
    kept = table.mad_outliers('one')

    self.assertEqual(len(kept.rows), 10)
    self.assertNotIn(200, kept.columns['one'])
def test_mad_outliers_reject(self):
    """
    Check that ``mad_outliers`` with ``reject=True`` returns only the
    single row whose ``one`` value is 200.
    """
    typical_row = (50, 4, 'a')
    extreme_row = (200, 1, 'b')
    sample = [typical_row] * 10 + [extreme_row]

    source = Table(sample, self.column_types, self.column_names)
    rejected = source.mad_outliers('one', reject=True)

    self.assertEqual(len(rejected.rows), 1)
    self.assertSequenceEqual(rejected.columns['one'], (200, ))
def analyse_effort_and_analytics():
    """
    Summarise pageviews and sessions by ``visuals_contributed``.

    Reads ``www/live-data/raw_effort_and_analytics.csv`` (the first row is
    used as column names), aggregates the sum, mean and median of both
    ``pageviews`` and ``sessions`` per ``visuals_contributed`` value, prints
    each summary row and writes the summary to
    ``www/live-data/effort_and_analytics_summary.csv``.
    """
    column_types = (text_type, text_type, number_type, number_type, boolean_type)

    with open('www/live-data/raw_effort_and_analytics.csv') as f:
        rows = list(csv.reader(f))

    # The header row is removed from the data and used for the column names.
    column_names = rows.pop(0)
    table = Table(rows, column_types, column_names)

    summary = table.aggregate('visuals_contributed', (
        ('pageviews', 'sum'),
        ('pageviews', 'mean'),
        ('pageviews', 'median'),
        ('sessions', 'sum'),
        ('sessions', 'mean'),
        ('sessions', 'median'),
    ))

    for row in summary.rows:
        # Parenthesised single-argument form prints the same on Python 2
        # and is valid syntax on Python 3.
        print(row)

    _write_summary_csv(summary, 'www/live-data/effort_and_analytics_summary.csv')
def setUp(self):
    """
    Build the shared fixture: two tables with equal row values, ``left``
    with columns ``one``/``two``/``three`` and ``right`` with columns
    ``four``/``five``/``six``.
    """
    self.left_rows = (
        (1, 4, 'a'),
        (2, 3, 'b'),
        (None, 2, 'c'),
    )
    self.right_rows = (
        (1, 4, 'a'),
        (2, 3, 'b'),
        (None, 2, 'c'),
    )

    self.number_type = NumberType()
    self.text_type = TextType()
    self.column_types = (self.number_type,) * 2 + (self.text_type,)

    left_names = ('one', 'two', 'three')
    right_names = ('four', 'five', 'six')
    self.left = Table(self.left_rows, self.column_types, left_names)
    self.right = Table(self.right_rows, self.column_types, right_names)
def test_create_table_args(self):
    """
    Check that the ``Table`` constructor raises ``ValueError`` for each of
    three column type / column name combinations.

    NOTE(review): presumably these fail because the counts disagree with
    each other or with the fixture's row width -- confirm against ``Table``.
    """
    number, text = self.number_type, self.text_type

    rejected_arguments = (
        # four types with the fixture's names
        ([number, number, text, text], self.column_names),
        # the fixture's types with four names
        (self.column_types, ['one', 'two', 'three', 'four']),
        # two types with two names
        ([number, number], ['one', 'two']),
    )

    for column_types, column_names in rejected_arguments:
        with self.assertRaises(ValueError):
            Table(self.rows, column_types, column_names)
def test_group_by(self):
    """
    Check ``group_by('one')``: three groups keyed ``'a'``, ``'b'`` and
    ``None``, each holding only rows with that value in column ``one``.
    """
    table = Table(self.rows, self.column_types, self.column_names)
    groups = table.group_by('one')

    self.assertEqual(len(groups), 3)

    for key in ('a', 'b', None):
        self.assertIn(key, groups.keys())

    expected_columns = (
        ('a', ('a', 'a')),
        ('b', ('b', )),
        (None, (None, )),
    )
    for key, expected in expected_columns:
        self.assertSequenceEqual(groups[key].columns['one'], expected)
def test_group_by(self):
    """
    Check that grouping by column ``one`` gives exactly three tables, keyed
    by ``'a'``, ``'b'`` and ``None``, with the expected ``one`` column in
    each.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    grouped = source.group_by('one')

    self.assertEqual(len(grouped), 3)

    group_keys = ('a', 'b', None)
    for group_key in group_keys:
        self.assertIn(group_key, grouped.keys())

    expected_values = (('a', 'a'), ('b',), (None,))
    for group_key, values in zip(group_keys, expected_values):
        self.assertSequenceEqual(grouped[group_key].columns['one'], values)
def test_order_by(self):
    """
    Check ``order_by('two')``: the result is a distinct three-row table in
    the expected order, and the source table's rows keep their order.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    ordered = source.order_by('two')

    self.assertIsNot(ordered, source)
    self.assertEqual(len(ordered.rows), 3)

    sorted_rows = ((None, 2, 'c'), (2, 3, 'b'), (1, 4, 'a'))
    for index, expected in enumerate(sorted_rows):
        self.assertSequenceEqual(ordered.rows[index], expected)

    # Verify old table not changed
    untouched_rows = ((1, 4, 'a'), (2, 3, 'b'), (None, 2, 'c'))
    for index, expected in enumerate(untouched_rows):
        self.assertSequenceEqual(source.rows[index], expected)
def test_select(self):
    """
    Check ``select`` with the single column ``three``: the result is a
    distinct three-row, one-column table with the expected rows, column
    type, column name and column values.
    """
    source = Table(self.rows, self.column_types, self.column_names)
    selected = source.select(('three',))

    self.assertIsNot(selected, source)
    self.assertEqual(len(selected.rows), 3)

    for index, expected in enumerate((('a',), ('b',), ('c',))):
        self.assertSequenceEqual(selected.rows[index], expected)

    self.assertEqual(len(selected.columns), 1)
    self.assertSequenceEqual(selected._column_types, (self.text_type,))
    self.assertSequenceEqual(selected._column_names, ('three',))
    self.assertSequenceEqual(selected.columns['three'], ('a', 'b', 'c'))
def test_select(self):
    """
    Check that selecting only ``three`` gives a distinct table with three
    single-value rows and one text column named ``three``.
    """
    only_three = ('three', )

    table = Table(self.rows, self.column_types, self.column_names)
    narrowed = table.select(only_three)

    self.assertIsNot(narrowed, table)
    self.assertEqual(len(narrowed.rows), 3)

    expected_rows = (('a', ), ('b', ), ('c', ))
    for position, expected_row in enumerate(expected_rows):
        self.assertSequenceEqual(narrowed.rows[position], expected_row)

    self.assertEqual(len(narrowed.columns), 1)
    self.assertSequenceEqual(narrowed._column_types, (self.text_type, ))
    self.assertSequenceEqual(narrowed._column_names, ('three', ))
    self.assertSequenceEqual(narrowed.columns['three'], ('a', 'b', 'c'))
def test_order_by_func(self):
    """
    Check that ``order_by`` accepts a key function: ordering by the pair
    ``(one, two)`` gives a distinct three-row table in the expected order.
    """
    def pair_key(row):
        return (row['one'], row['two'])

    sample = ((1, 2, 'a'), (2, 1, 'b'), (1, 1, 'c'))
    table = Table(sample, self.column_types, self.column_names)
    result = table.order_by(pair_key)

    self.assertIsNot(result, table)
    self.assertEqual(len(result.rows), 3)

    expected_order = (
        (1, 1, 'c'),
        (1, 2, 'a'),
        (2, 1, 'b'),
    )
    for position, expected_row in enumerate(expected_order):
        self.assertSequenceEqual(result.rows[position], expected_row)
def test_create_table(self):
    """
    Check that a table built from the fixture exposes three rows with the
    expected values, in order.
    """
    table = Table(self.rows, self.column_types, self.column_names)

    self.assertEqual(len(table.rows), 3)

    expected_rows = (
        (1, 4, 'a'),
        (2, 3, 'b'),
        (None, 2, 'c'),
    )
    for index, expected in enumerate(expected_rows):
        self.assertSequenceEqual(table.rows[index], expected)
def test_order_by_nulls(self):
    """
    Check the position of ``None`` values after ordering by ``two`` and
    after ordering by ``three``: in both cases they come last.
    """
    sample = ((1, 2, None), (2, None, None), (1, 1, 'c'), (1, None, 'a'))
    source = Table(sample, self.column_types, self.column_names)

    ordered = source.order_by('two')
    expected_two = (1, 2, None, None)
    self.assertSequenceEqual(ordered.columns['two'], expected_two)

    ordered = source.order_by('three')
    expected_three = ('a', 'c', None, None)
    self.assertSequenceEqual(ordered.columns['three'], expected_three)
def test_distinct_column(self):
    """
    Check that ``distinct`` on column ``one`` gives a distinct two-row
    table with the expected rows and ``one`` column.
    """
    sample = ((1, 2, 'a'), (2, None, None), (1, 1, 'c'), (1, None, None))
    table = Table(sample, self.column_types, self.column_names)
    result = table.distinct('one')

    self.assertIsNot(result, table)
    self.assertEqual(len(result.rows), 2)

    expected_rows = (
        (1, 2, 'a'),
        (2, None, None),
    )
    for position, expected_row in enumerate(expected_rows):
        self.assertSequenceEqual(result.rows[position], expected_row)

    self.assertSequenceEqual(result.columns['one'], (1, 2))
def setUp(self):
    """
    Build the shared fixture: three rows, two number columns followed by a
    text column named ``one`` to ``three``, and a ``Table`` built from them.
    """
    self.rows = (
        (1, 2, 'a'),
        (2, 3, 'b'),
        (None, 4, 'c'),
    )
    self.column_names = ('one', 'two', 'three')

    self.number_type = NumberType()
    self.text_type = TextType()

    # Two columns sharing the number type, then one text column.
    self.column_types = (self.number_type,) * 2 + (self.text_type,)

    self.table = Table(self.rows, self.column_types, self.column_names)
def test_count(self):
    """
    Check the column ``count`` method for a value that occurs three times,
    a value that never occurs, and ``None``.
    """
    data = (
        (1, 2, 'a'),
        (2, 3, 'b'),
        (None, 4, 'c'),
        (1, 2, 'a'),
        (1, 2, 'a'),
    )
    table = Table(data, self.column_types, self.column_names)

    expected_counts = (
        (1, 3),
        (4, 0),
        (None, 1),
    )
    for value, expected in expected_counts:
        self.assertEqual(table.columns['one'].count(value), expected)
def test_distinct_func(self):
    """
    Check that ``distinct`` accepts a key function: keying on the pair
    ``(two, three)`` gives a distinct three-row table with the expected
    rows and ``one`` column.
    """
    def pair_key(row):
        return (row['two'], row['three'])

    sample = ((1, 2, 'a'), (2, None, None), (1, 1, 'c'), (1, None, None))
    table = Table(sample, self.column_types, self.column_names)
    result = table.distinct(pair_key)

    self.assertIsNot(result, table)
    self.assertEqual(len(result.rows), 3)

    expected_rows = (
        (1, 2, 'a'),
        (2, None, None),
        (1, 1, 'c'),
    )
    for position, expected_row in enumerate(expected_rows):
        self.assertSequenceEqual(result.rows[position], expected_row)

    self.assertSequenceEqual(result.columns['one'], (1, 2, 1))
def analyse_photo_efforts_fb():
    """
    Summarise photo effort by Facebook placement.

    Reads ``www/live-data/photo_efforts_fb.csv`` (the first row is used as
    column names), aggregates the summed ``duration`` per ``on_facebook``
    value, adds ``on_facebook_count_pct`` and ``duration_sum_pct`` columns
    and writes the result to ``www/live-data/facebook_summary.csv``.
    """
    column_types = (number_type,) + (text_type,) * 3 + (boolean_type,)

    with open('www/live-data/photo_efforts_fb.csv') as f:
        rows = list(csv.reader(f))

    # The header row is removed from the data and used for the column names.
    column_names = rows.pop(0)
    table = Table(rows, column_types, column_names)

    facebook_summary = table.aggregate('on_facebook', (('duration', 'sum'),))

    # Percentage of rows per group.
    count_total = facebook_summary.columns['on_facebook_count'].sum()

    def count_pct(row):
        return (row['on_facebook_count']/count_total) * 100

    facebook_summary = facebook_summary.compute(
        'on_facebook_count_pct', number_type, count_pct)

    # Percentage of summed duration per group.
    duration_total = facebook_summary.columns['duration_sum'].sum()

    def duration_pct(row):
        return (row['duration_sum']/duration_total) * 100

    facebook_summary = facebook_summary.compute(
        'duration_sum_pct', number_type, duration_pct)

    _write_summary_csv(facebook_summary, 'www/live-data/facebook_summary.csv')
def setUp(self):
    """
    Prepare the shared fixture: four rows of one text value followed by
    three numeric values, the matching column types and names, and a
    ``Table`` built from them.
    """
    number = NumberType()
    text = TextType()

    self.rows = (
        ('a', 2, 3, 4),
        (None, 3, 5, None),
        ('a', 2, 4, None),
        ('b', 3, 4, None),
    )
    self.number_type = number
    self.text_type = text
    self.column_types = (text, number, number, number)
    self.column_names = ('one', 'two', 'three', 'four')
    self.table = Table(self.rows, self.column_types, self.column_names)