# NOTE: the bamboo-internal import paths below are assumptions based on the
# project layout and may need adjusting; the stdlib and pandas imports are
# exact.
import json
import os
from tempfile import NamedTemporaryFile

from pandas import concat

from bamboo.controllers.calculations import Calculations
from bamboo.controllers.datasets import Datasets
from bamboo.core.summary import SUMMARY
from bamboo.lib.mongo import MONGO_ID
from bamboo.models.dataset import Dataset
from bamboo.tests.decorators import run_profiler
from bamboo.tests.mock import MockUploadedFile
from bamboo.tests.test_base import TestBase
from bamboo.tests.utils import comparable


class TestAbstractDatasets(TestBase):
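    """Shared helpers for dataset controller tests.

    The underscore-prefixed methods are not collected as tests; subclasses
    call them to create datasets, post calculations, and check summaries
    against the good_eats fixtures.
    """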

    NUM_COLS = 15
    NUM_ROWS = 19

    def setUp(self):
        TestBase.setUp(self)
        self.controller = Datasets()
        self._file_name = 'good_eats.csv'
        self._update_file_name = 'good_eats_update.json'
        self._update_check_file_path = '%sgood_eats_update_values.json' % (
            self.FIXTURE_PATH)
        self.default_formulae = [
            'amount',
            'amount + 1',
            'amount - 5',
        ]

    def _put_row_updates(self, dataset_id=None, file_name=None, validate=True):
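        """Send row updates from a fixture file to the dataset.

        When validate is set, assert that the controller echoes the dataset
        ID back. Also loads the expected post-update values for later
        comparison.
        """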
        if not dataset_id:
            dataset_id = self.dataset_id

        if not file_name:
            file_name = self._update_file_name

        with open('%s%s' % (self.FIXTURE_PATH, file_name), 'r') as f:
            update = f.read()
        result = json.loads(self.controller.update(dataset_id=dataset_id,
                                                   update=update))

        if validate:
            self.assertTrue(isinstance(result, dict))
            self.assertTrue(Dataset.ID in result)

        # set up the (default) values to test against
        with open(self._update_check_file_path, 'r') as f:
            self._update_values = json.load(f)

    def _load_schema(self):
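        """Fetch the dataset's schema via the controller's info call."""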
        return json.loads(
            self.controller.info(self.dataset_id))[Dataset.SCHEMA]

    def _check_dframes_are_equal(self, dframe1, dframe2):
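        """Assert two dataframes hold the same rows by checking mutual
        subset inclusion of their comparable row representations."""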
        rows1 = comparable(dframe1)
        rows2 = comparable(dframe2)

        self.__check_dframe_is_subset(rows1, rows2)
        self.__check_dframe_is_subset(rows2, rows1)

    def __check_dframe_is_subset(self, rows1, rows2):
        for row in rows1:
            self.assertTrue(row in rows2,
                            '\nrow:\n%s\n\nnot in rows2:\n%s' % (row, rows2))

    def _post_calculations(self, formulae=None, group=None):
        # avoid a mutable default argument
        formulae = formulae or []
        schema = self._load_schema()
        controller = Calculations()

        for idx, formula in enumerate(formulae):
            # use a generated name when the formula collides with an existing
            # schema column (or there is no schema to check against);
            # otherwise name the calculation after the formula itself
            if not schema or formula in schema.keys():
                name = 'calc_%d' % idx
            else:
                name = formula

            controller.create(self.dataset_id, formula=formula, name=name,
                              group=group)

    def _test_summary_built(self, result):
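        """Record the new dataset's ID from result, then request and return
        a summary over all of its columns."""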
        # check that summary is created
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)
        self.dataset_id = result[Dataset.ID]

        results = self.controller.summary(
            self.dataset_id,
            select=self.controller.SELECT_ALL_FOR_SUMMARY)

        return self._test_summary_results(results)

    def _test_summary_results(self, results):
        results = json.loads(results)
        self.assertTrue(isinstance(results, dict))
        return results

    def _test_aggregations(self, groups=None):
        # avoid a mutable default argument
        groups = groups or ['']
        results = json.loads(self.controller.aggregations(self.dataset_id))
        self.assertTrue(isinstance(results, dict))
        self.assertEqual(len(results.keys()), len(groups))
        # compare as sorted lists since dict key order is not guaranteed
        self.assertEqual(sorted(results.keys()), sorted(groups))
        linked_dataset_id = results[groups[0]]
        self.assertTrue(isinstance(linked_dataset_id, basestring))

        # inspect linked dataset
        return json.loads(self.controller.show(linked_dataset_id))

    def _test_summary_no_group(self, results, dataset_id=None, group=None):
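        """Check summary results with no group (or a single group): expect
        one key per non-grouped column, each mapping the column's slug to a
        summary."""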
        if not dataset_id:
            dataset_id = self.dataset_id

        group = [group] if group else []
        result_keys = results.keys()

        # minus the column that we are grouping on
        self.assertEqual(len(result_keys), self.NUM_COLS - len(group))

        columns = [col for col in
                   self.get_data(self._file_name).columns.tolist()
                   if col not in [MONGO_ID] + group]

        dataset = Dataset.find_one(dataset_id)
        labels_to_slugs = dataset.schema.labels_to_slugs

        for col in columns:
            slug = labels_to_slugs[col]
            self.assertTrue(slug in result_keys,
                            'col (slug): %s not in: %s' % (slug, result_keys))
            self.assertTrue(SUMMARY in results[slug].keys())
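

# Hypothetical usage sketch, not part of the original suite: a concrete
# subclass exercising the helpers above. The class name, test name, and the
# flow are illustrative; the create(csv_file=...) call mirrors the one used
# by TestProfile below.
class TestGoodEatsExample(TestAbstractDatasets):

    def test_create_calculate_and_update(self):
        # upload the fixture CSV and record the new dataset's ID
        mock_csv = MockUploadedFile(
            open('%s%s' % (self.FIXTURE_PATH, self._file_name), 'r'))
        result = json.loads(self.controller.create(csv_file=mock_csv))
        self._test_summary_built(result)

        # add the default calculations, then apply the fixture row updates
        self._post_calculations(self.default_formulae)
        self._put_row_updates()
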
class TestProfile(TestBase):
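    """Profile dataset creation, info, and summary calls at three synthetic
    dataset sizes; each TEST_CASE_SIZES entry is a (width_exp,
    length_factor) pair passed to _grow_test_data."""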

    TEST_CASE_SIZES = {
        'tiny': (1, 1),
        'small': (2, 2),
        'large': (4, 40),
    }

    def setUp(self):
        TestBase.setUp(self)
        self.datasets = Datasets()
        self.tmp_file = NamedTemporaryFile(delete=False)

    def tearDown(self):
        os.unlink(self.tmp_file.name)

    def _expand_width(self, df, exponent):
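        """Self-join the frame exponent times, renaming columns to keep them
        unique, so the column count doubles on each pass (2 ** exponent
        growth in width)."""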
        for _ in xrange(exponent):
            other = df.rename(columns={
                col: '%s-%s' % (col, idx)
                for (idx, col) in enumerate(df.columns)
            })
            df = df.join(other)
            df.rename(columns={
                col: str(idx)
                for (idx, col) in enumerate(df.columns)
            },
                      inplace=True)
        return df

    def _grow_test_data(self, dataset_name, width_exp, length_factor):
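        """Load the named fixture and grow it: 2 ** width_exp times wider
        and length_factor times longer."""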
        df = self.get_data(dataset_name)
        df = self._expand_width(df, width_exp)
        return concat([df] * length_factor)

    def test_tiny_profile(self):
        self._test_profile('tiny')

    def test_small_profile(self):
        self._test_profile('small')

    def test_large_profile(self):
        self._test_profile('large')

    @run_profiler
    def _test_profile(self, size):
        print 'bamboo/bamboo: %s' % size
        self._test_create_data(*self.TEST_CASE_SIZES[size])
        print 'saving dataset'
        self._test_save_dataset()
        self._test_get_info()
        self._test_get_summary()
        self._test_get_summary_with_group('province')
        self._test_get_summary_with_group('school_zone')

    def _test_create_data(self, width_exp, length_factor):
        self.data = self._grow_test_data('kenya_secondary_schools_2007.csv',
                                         width_exp, length_factor)
        print 'bamboo/bamboo rows: %s, columns: %s' % (
            len(self.data), len(self.data.columns))

    def _test_save_dataset(self):
        self.data.to_csv(self.tmp_file)
        self.tmp_file.close()
        mock_uploaded_file = MockUploadedFile(open(self.tmp_file.name, 'r'))
        result = json.loads(self.datasets.create(csv_file=mock_uploaded_file))
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)
        self.dataset_id = result[Dataset.ID]

    def _test_get_info(self):
        result = json.loads(self.datasets.info(self.dataset_id))
        self.assertTrue(isinstance(result, dict))

    def _test_get_summary(self):
        result = json.loads(
            self.datasets.summary(self.dataset_id,
                                  select=self.datasets.SELECT_ALL_FOR_SUMMARY))
        self.assertTrue(isinstance(result, dict))

    def _test_get_summary_with_group(self, group):
        result = json.loads(
            self.datasets.summary(self.dataset_id,
                                  group=group,
                                  select=self.datasets.SELECT_ALL_FOR_SUMMARY))
        self.assertTrue(isinstance(result, dict))