def setUp(self):
    TestBase.setUp(self)
    self.controller = Calculations()
    self.dataset_controller = Datasets()
    self.dataset_id = None
    self.formula = 'amount + gps_alt'
    self.name = 'test'
Example #2
def setUp(self):
    TestBase.setUp(self)
    self.controller = Datasets()
    self._file_name = 'good_eats.csv'
    self._update_file_name = 'good_eats_update.json'
    self._update_check_file_path = '%sgood_eats_update_values.json' % (
        self.FIXTURE_PATH)
    self.default_formulae = [
        'amount',
        'amount + 1',
        'amount - 5',
    ]
Example #3
def connect_routes(dispatcher):
    """This function takes the dispatcher and attaches the routes.

    :param dispatcher: The CherryPy dispatcher.
    """
    # controller instances map
    controllers = {
        'root': Root(),
        'calculations': Calculations(),
        'datasets': Datasets(),
        'version': Version(),
    }

    # map each route tuple into keyword arguments for the dispatcher
    dictify = lambda x: dict(
        zip(['name', 'conditions', 'route', 'controller', 'action'], x))
    route_case = {
        'conditions': lambda v: dict(method=v),
        'controller': lambda v: controllers[v],
    }
    kwarg_map = lambda d: {
        k: route_case.get(k, lambda v: v)(v) for k, v in d.iteritems()
    }

    routes = [kwarg_map(dictify(route)) for route in ROUTES + options()]

    # attach them
    for route in routes:
        dispatcher.connect(**route)
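
For illustration, here is how a single route tuple flows through dictify and
kwarg_map. The tuple below is hypothetical (the real entries live in the
module's ROUTES constant), but the transformation is exactly the one defined
above.

# A route tuple is positional: (name, conditions, route, controller, action).
route = ('dataset_show', 'GET', '/datasets/:dataset_id', 'datasets', 'show')

# dictify pairs each position with its keyword name.
kwargs = dict(zip(['name', 'conditions', 'route', 'controller', 'action'],
                  route))

# kwarg_map then rewrites two of the values: 'conditions' becomes
# dict(method='GET') for CherryPy's method matching, and 'controller' is
# looked up in the controllers map. The result feeds dispatcher.connect.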
Example #7
def test_dataset_controller_options(self):
    controller = Datasets()
    controller.options('dataset_id')
class TestCalculations(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.controller = Calculations()
        self.dataset_controller = Datasets()
        self.dataset_id = None
        self.formula = 'amount + gps_alt'
        self.name = 'test'

    def __post_formula(self, formula=None, name=None):
        if not formula:
            formula = self.formula
        if not name:
            name = self.name

        if not self.dataset_id:
            self.dataset_id = self._post_file()

        return self.controller.create(self.dataset_id, formula, name)

    def __post_update(self, dataset_id, update):
        return json.loads(self.dataset_controller.update(
            dataset_id=dataset_id, update=json.dumps(update)))

    def __wait_for_calculation_ready(self, dataset_id, name):
        while True:
            calculation = Calculation.find_one(dataset_id, name)

            if calculation.is_ready:
                break

            sleep(self.SLEEP_DELAY)

    def __test_error(self, response, error_text=None):
        response = json.loads(response)

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.ERROR in response)

        if not error_text:
            error_text = 'Must provide'

        self.assertTrue(error_text in response[self.controller.ERROR])

    def __test_create_from_json(self, json_filename, non_agg_cols=1, ex_len=1,
                                group=None):
        json_filepath = 'tests/fixtures/%s' % json_filename
        mock_uploaded_file = self._file_mock(json_filepath)
        dataset = Dataset.find_one(self.dataset_id)
        prev_columns = len(dataset.dframe().columns)
        response = json.loads(self.controller.create(
            self.dataset_id, json_file=mock_uploaded_file, group=group))

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[Dataset.ID])

        self.assertEqual(
            ex_len, len(json.loads(self.controller.show(self.dataset_id))))
        self.assertEqual(
            prev_columns + non_agg_cols,
            len(dataset.reload().dframe().columns))

        return dataset

    def __verify_create(self, response):
        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertEqual(response[Dataset.ID], self.dataset_id)

        self.__wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)
        dframe = dataset.dframe()

        self.assertTrue(self.name in dataset.schema.keys())
        self.assertTrue(self.name in dframe.columns)
        self.assertEqual(TestAbstractDatasets.NUM_ROWS, len(dframe))
        self.assertEqual(TestAbstractDatasets.NUM_ROWS,
                         dataset.info()[Dataset.NUM_ROWS])

    def test_show(self):
        self.__post_formula()
        response = self.controller.show(self.dataset_id)

        self.assertTrue(isinstance(json.loads(response), list))

    def test_create(self):
        response = json.loads(self.__post_formula())
        self.__verify_create(response)

    @requires_async
    def test_create_async_not_ready(self):
        self.dataset_id = self._create_dataset_from_url(
            '%s%s' % (self._local_fixture_prefix(), 'good_eats_huge.csv'))
        response = json.loads(self.__post_formula())
        dataset = Dataset.find_one(self.dataset_id)

        self.assertFalse(dataset.is_ready)
        self.assertTrue(isinstance(response, dict))
        self.assertFalse(DATASET_ID in response)

        self._wait_for_dataset_state(self.dataset_id)

        self.assertFalse(self.name in dataset.schema.keys())

    @requires_async
    def test_create_async_sets_calculation_status(self):
        self.dataset_id = self._create_dataset_from_url(
            '%s%s' % (self._local_fixture_prefix(), 'good_eats_huge.csv'))

        self._wait_for_dataset_state(self.dataset_id)

        response = json.loads(self.__post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertEqual(response[Dataset.ID], self.dataset_id)

        response = json.loads(self.controller.show(self.dataset_id))[0]

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(Calculation.STATE in response)
        self.assertEqual(response[Calculation.STATE],
                         Calculation.STATE_PENDING)

        self.__wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.schema.keys())

    @requires_async
    def test_create_async(self):
        self.dataset_id = self._post_file()

        self._wait_for_dataset_state(self.dataset_id)

        response = json.loads(self.__post_formula())
        self.__verify_create(response)

    def test_create_invalid_formula(self):
        dataset_id = self._post_file()
        result = json.loads(
            self.controller.create(dataset_id, '=NON_EXIST', self.name))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Datasets.ERROR in result.keys())

    def test_create_update_summary(self):
        dataset_id = self._post_file()
        Datasets().summary(
            dataset_id,
            select=Datasets.SELECT_ALL_FOR_SUMMARY)
        dataset = Dataset.find_one(dataset_id)

        self.assertTrue(isinstance(dataset.stats, dict))
        self.assertTrue(isinstance(dataset.stats[Dataset.ALL], dict))

        self.__post_formula()

        # stats should have new column for calculation
        dataset = Dataset.find_one(self.dataset_id)
        stats = dataset.stats.get(Dataset.ALL)
        self.assertTrue(self.name in stats.keys())

    def test_delete_nonexistent_calculation(self):
        dataset_id = self._post_file()
        result = json.loads(self.controller.delete(dataset_id, self.name))

        self.assertTrue(Calculations.ERROR in result)

    def test_delete(self):
        self.__post_formula()
        result = json.loads(self.controller.delete(self.dataset_id, self.name))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.schema.labels_to_slugs)

    def test_delete_calculation_not_in_dataset(self):
        self.__post_formula()

        # Remove column from dataset
        dataset = Dataset.find_one(self.dataset_id)
        dataset.delete_columns([self.name])

        result = json.loads(self.controller.delete(self.dataset_id, self.name))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.schema.labels_to_slugs)

    def test_delete_update_summary(self):
        self.__post_formula()

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name in dataset.stats.get(Dataset.ALL).keys())

        json.loads(self.controller.delete(self.dataset_id, self.name))

        dataset = Dataset.find_one(self.dataset_id)
        self.assertTrue(self.name not in dataset.stats.get(Dataset.ALL).keys())

    def test_show_jsonp(self):
        self.__post_formula()
        results = self.controller.show(self.dataset_id, callback='jsonp')

        self.assertEqual('jsonp(', results[0:6])
        self.assertEqual(')', results[-1])

    def test_create_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        response = json.loads(self.__post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertEqual(response[Dataset.ID], self.dataset_id)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue('' in dataset.aggregated_datasets_dict.keys())

    def test_delete_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        json.loads(self.__post_formula())

        result = json.loads(
            self.controller.delete(self.dataset_id, self.name, ''))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        agg_dataset = dataset.aggregated_dataset('')

        self.assertTrue(self.name not in agg_dataset.schema.labels_to_slugs)

    def test_error_on_delete_calculation_with_dependency(self):
        self.__post_formula()
        dep_name = self.name
        self.formula = dep_name
        self.name = 'test1'
        response = json.loads(self.__post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)

        result = json.loads(
            self.controller.delete(self.dataset_id, dep_name, ''))

        self.assertTrue(AbstractController.ERROR in result)
        self.assertTrue('depend' in result[AbstractController.ERROR])

    def test_create_multiple(self):
        self.dataset_id = self._post_file()
        self.__test_create_from_json(
            'good_eats.calculations.json', non_agg_cols=2, ex_len=2)

    def test_create_multiple_ignore_group(self):
        self.dataset_id = self._post_file()
        dataset = self.__test_create_from_json(
            'good_eats.calculations.json', non_agg_cols=2, ex_len=2,
            group='risk_factor')

        self.assertEqual(dataset.aggregated_datasets_dict, {})

    def test_create_json_single(self):
        self.dataset_id = self._post_file()
        self.__test_create_from_json('good_eats_single.calculations.json')

    def test_create_multiple_with_group(self):
        self.dataset_id = self._post_file()
        groups = ['risk_factor', 'risk_factor,food_type', 'food_type']
        dataset = self.__test_create_from_json(
            'good_eats_group.calculations.json', non_agg_cols=2, ex_len=6)

        for group in groups:
            self.assertTrue(group in dataset.aggregated_datasets_dict.keys())
            dframe = dataset.aggregated_dataset(group).dframe()

            for column in Calculation().split_groups(group):
                self.assertTrue(column in dframe.columns)

    def test_create_with_missing_args(self):
        self.dataset_id = self._post_file()
        self.__test_error(self.controller.create(self.dataset_id))
        self.__test_error(
            self.controller.create(self.dataset_id, formula='gps_alt'))
        self.__test_error(
            self.controller.create(self.dataset_id, name='test'))

    def test_create_with_bad_json(self):
        self.dataset_id = self._post_file()
        json_filepath = self._fixture_path_prefix(
            'good_eats_bad.calculations.json')
        mock_uploaded_file = self._file_mock(json_filepath)

        self.__test_error(
            self.controller.create(self.dataset_id,
                                   json_file=mock_uploaded_file),
            error_text='Required')

        # Mock is now an empty file
        self.__test_error(
            self.controller.create(self.dataset_id,
                                   json_file=mock_uploaded_file),
            error_text='Improper format for JSON')

    def test_create_reserved_name(self):
        name = 'sum'
        response = json.loads(self.__post_formula(None, name))

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertEqual(response[Dataset.ID], self.dataset_id)

        dataset = Dataset.find_one(self.dataset_id)
        slug = dataset.schema.labels_to_slugs[name]
        response = json.loads(self.__post_formula('%s + amount' % slug))

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[Dataset.ID])

    def test_create_with_duplicate_names(self):
        formula_names_to_valid = {
            'water_not_functioning_none': True,   # an already slugged column
            'water_not_functioning/none': False,  # a non-slug column
            'region': False,    # an existing column
            'date': False,      # a reserved key and an existing column
            'sum': True,        # a reserved key
        }

        for formula_name, valid in formula_names_to_valid.items():
            dataset_id = self._post_file('water_points.csv')
            dframe_before = Dataset.find_one(dataset_id).dframe()

            # a calculation
            response = json.loads(self.controller.create(
                dataset_id,
                'water_source_type in ["borehole"]',
                formula_name))

            self.assertTrue(isinstance(response, dict))

            if valid:
                self.assertTrue(self.controller.SUCCESS in response)
            else:
                self.assertTrue(self.controller.ERROR in response)
                self.assertTrue(
                    formula_name in response[self.controller.ERROR])

            dataset = Dataset.find_one(dataset_id)

            if valid:
                name = dataset.calculations()[-1].name

            # an aggregation
            response = json.loads(self.controller.create(
                dataset_id,
                'newest(date_, water_functioning)',
                formula_name))

            self.assertTrue(isinstance(response, dict))
            self.assertTrue(self.controller.SUCCESS in response)

            dframe_after = dataset.dframe()

            # Does not change data
            self.assertEqual(len(dframe_before), len(dframe_after))

            if valid:
                slug = dataset.schema.labels_to_slugs[name]
                self.assertTrue(slug not in dframe_before.columns)
                self.assertTrue(slug in dframe_after.columns)

            if valid:
                # Does change columns
                self.assertEqual(
                    len(dframe_before.columns) + 1, len(dframe_after.columns))
            else:
                # Does not change columns
                self.assertEqual(
                    len(dframe_before.columns), len(dframe_after.columns))

            # check OK on update
            update = {
                'date': '2013-01-05',
                'water_source_type': 'borehole',
            }
            result = self.__post_update(dataset_id, update)
            self.assertTrue(Dataset.ID in result)
            dataset = Dataset.find_one(dataset_id)
            dframe_after_update = dataset.dframe()
            self.assertEqual(len(dframe_after) + 1, len(dframe_after_update))

    def test_cannot_create_aggregations_with_duplicate_names(self):
        dataset_id = self._post_file('water_points.csv')

        formula_name = 'name'

        response = json.loads(self.controller.create(
            dataset_id,
            'newest(date_, water_functioning)',
            formula_name))

        self.assertTrue(self.controller.SUCCESS in response)

        # another with the same name
        response = json.loads(self.controller.create(
            dataset_id,
            'newest(date_, water_functioning)',
            formula_name))

        self.assertTrue(formula_name in response[self.controller.ERROR])

    def test_can_create_aggregations_with_duplicate_as_slug_names(self):
        dataset_id = self._post_file('water_points.csv')

        formula_name = 'name*'

        response = json.loads(self.controller.create(
            dataset_id,
            'newest(date_, water_functioning)',
            formula_name))

        self.assertTrue(self.controller.SUCCESS in response)

        # another with the same name
        response = json.loads(self.controller.create(
            dataset_id,
            'newest(date_, water_functioning)',
            'name_'))

        self.assertTrue(self.controller.SUCCESS in response)

    def test_newest(self):
        expected_dataset = {
            u'wp_functional': {0: u'no', 1: u'yes', 2: u'no', 3: u'yes'},
            u'id': {0: 1, 1: 2, 2: 3, 3: 4}}
        dataset_id = self._post_file('newest_test.csv')
        self.controller.create(dataset_id,
                               'newest(submit_date,functional)',
                               'wp_functional', group='id')
        dataset = Dataset.find_one(dataset_id)
        agg_ds = dataset.aggregated_dataset('id')

        self.assertEqual(expected_dataset, agg_ds.dframe().to_dict())

    def test_update_after_agg(self):
        dataset_id = self._post_file('wp_data.csv')
        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,wp_id)', 'wp_newest'))

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        self.assertTrue(self.controller.SUCCESS in results)
        self.assertFalse(dataset.aggregated_dataset('') is None)

        update = {
            'submit_date': '2013-01-05',
            'wp_id': 'D',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)
        update = {
            'wp_id': 'E',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)

        dataset = Dataset.find_one(dataset_id)
        current_num_rows = dataset.num_rows
        agg_df = dataset.aggregated_dataset('').dframe()

        self.assertEqual(agg_df.get_value(0, 'wp_newest'), 'D')
        self.assertEqual(current_num_rows, previous_num_rows + 2)

    @requires_async
    def test_update_after_agg_group(self):
        dataset_id = self._post_file('wp_data.csv')
        group = 'wp_id'
        self._wait_for_dataset_state(dataset_id)

        test_calculations = {
            'newest(submit_date,functional)': 'wp_functional',
            'max(submit_date)': 'latest_submit_date',
            'ratio(functional in ["yes"], 1)': 'wp_func_ratio'}

        expected_results = {'wp_id': ['A', 'B', 'C', 'n/a'],
                            'wp_functional': ['yes', 'no', 'yes', 'yes'],
                            'wp_func_ratio': [1.0, 0.0, 1.0, 1.0],
                            'wp_func_ratio_denominator': [1, 1, 1, 1],
                            'wp_func_ratio_numerator': [1.0, 0.0, 1.0, 1.0],
                            'latest_submit_date': [1356998400, 1357084800,
                                                   1357171200, 1357257600]}

        expected_results_after = {
            'wp_id': ['A', 'B', 'C', 'D', 'n/a'],
            'wp_functional': ['no', 'no', 'yes', 'yes'],
            'wp_func_ratio': [0.5, 0.0, 1.0, 1.0, 1.0],
            'wp_func_ratio_denominator': [2.0, 1.0, 1.0, 1.0, 1.0],
            'wp_func_ratio_numerator': [1.0, 0.0, 1.0, 1.0, 1.0],
            'latest_submit_date': [1357603200.0, 1357084800.0,
                                   1357171200.0, 1357257600.0]}

        for formula, name in test_calculations.items():
            results = json.loads(self.controller.create(
                dataset_id, formula, name, group=group))

            self.assertTrue(self.controller.SUCCESS in results)

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        while True:
            dataset = Dataset.find_one(dataset_id)

            if dataset.aggregated_dataset(group) and all(
                    [not c.is_pending for c in dataset.calculations()]):
                break
            sleep(self.SLEEP_DELAY)

        agg_dframe = dataset.aggregated_dataset(group).dframe()
        self.assertEqual(set(expected_results.keys()),
                         set(agg_dframe.columns.tolist()))

        for column, results in expected_results.items():
            self.assertEqual(results,
                             agg_dframe[column].tolist())

        update = {
            'wp_id': 'D',
            'functional': 'yes',
        }
        self.__post_update(dataset_id, update)
        update = {
            'submit_date': '2013-01-08',
            'wp_id': 'A',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)

        while True:
            dataset = Dataset.find_one(dataset_id)
            current_num_rows = dataset.num_rows

            if not len(dataset.pending_updates):
                break

            sleep(self.SLEEP_DELAY)

        dataset = Dataset.find_one(dataset_id)
        agg_dframe = dataset.aggregated_dataset(group).dframe()

        self.assertEqual(current_num_rows, previous_num_rows + 2)
        self.assertEqual(set(expected_results_after.keys()),
                         set(agg_dframe.columns.tolist()))
        for column, results in expected_results_after.items():
            column = [x for x in agg_dframe[column].tolist() if not
                      is_float_nan(x)]
            self.assertEqual(results, column)

    @requires_async
    def test_fail_in_background(self):
        dataset_id = self._post_file('wp_data.csv')
        group = 'wp_id'
        self._wait_for_dataset_state(dataset_id)

        self.controller.create(dataset_id,
                               'newest(submit_date,functional)',
                               'wp_functional',
                               group=group)
        self.controller.create(dataset_id,
                               'max(submit_date)',
                               'latest_submit_date',
                               group=group)

        # Rename the calculation so the dataset still appears to have a
        # pending calculation, forcing retries; the update fails after
        # 10 retries.
        calc = Calculation.find_one(dataset_id, 'latest_submit_date', group)
        calc.update({calc.NAME: 'another_name'})

        update = {
            'wp_id': 'D',
            'functional': 'yes',
        }
        self.__post_update(dataset_id, update)
        update = {
            'submit_date': '2013-01-08',
            'wp_id': 'A',
            'functional': 'no',
        }
        self.__post_update(dataset_id, update)

        while True:
            dataset = Dataset.find_one(dataset_id)
            calcs_not_pending = [
                c.state != c.STATE_PENDING for c in dataset.calculations()]

            if not len(dataset.pending_updates) and all(calcs_not_pending):
                break

            sleep(self.SLEEP_DELAY)

        for c in dataset.calculations():
            self.assertEqual(c.STATE_FAILED, c.state)
            self.assertTrue('Traceback' in c.error_message)

    def test_fail_then_create(self):
        response = json.loads(self.__post_formula())
        self.__verify_create(response)

        # Overwrite as failed
        calc = Calculation.find_one(self.dataset_id, self.name)
        calc.update({calc.STATE: calc.STATE_FAILED})

        # Test we can still add a calculation
        self.name = 'test2'
        response = json.loads(self.__post_formula())
        self.__verify_create(response)
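
The polling loops above (__wait_for_calculation_ready and the while True
loops in the async tests) spin until background work completes and never
time out. A bounded variant is easy to sketch; wait_until below is a
hypothetical helper, not part of the test suite.

def wait_until(predicate, delay, max_tries=100):
    # Poll until predicate() is truthy, sleeping `delay` seconds between
    # tries; give up after max_tries so a hung worker cannot hang the test.
    for _ in xrange(max_tries):
        if predicate():
            return True
        sleep(delay)
    return False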
class TestProfile(TestBase):

    TEST_CASE_SIZES = {
        'tiny': (1, 1),
        'small': (2, 2),
        'large': (4, 40),
    }

    def setUp(self):
        TestBase.setUp(self)
        self.datasets = Datasets()
        self.tmp_file = NamedTemporaryFile(delete=False)

    def tearDown(self):
        os.unlink(self.tmp_file.name)

    def _expand_width(self, df, exponent):
        for i in xrange(0, exponent):
            other = df.rename(columns={
                col: '%s-%s' % (col, idx)
                for (idx, col) in enumerate(df.columns)
            })
            df = df.join(other)
            df.rename(columns={
                col: str(idx)
                for (idx, col) in enumerate(df.columns)
            }, inplace=True)
        return df

    def _grow_test_data(self, dataset_name, width_exp, length_factor):
        df = self.get_data(dataset_name)
        df = self._expand_width(df, width_exp)
        return concat([df] * length_factor)

    def test_tiny_profile(self):
        self._test_profile('tiny')

    def test_small_profile(self):
        self._test_profile('small')

    def test_large_profile(self):
        self._test_profile('large')

    @run_profiler
    def _test_profile(self, size):
        print 'bamboo/bamboo: %s' % size
        self._test_create_data(*self.TEST_CASE_SIZES[size])
        print 'saving dataset'
        self._test_save_dataset()
        self._test_get_info()
        self._test_get_summary()
        self._test_get_summary_with_group('province')
        self._test_get_summary_with_group('school_zone')

    def _test_create_data(self, width_exp, length_factor):
        self.data = self._grow_test_data('kenya_secondary_schools_2007.csv',
                                         width_exp, length_factor)
        print 'bamboo/bamboo rows: %s, columns: %s' % (len(
            self.data), len(self.data.columns))

    def _test_save_dataset(self):
        self.data.to_csv(self.tmp_file)
        self.tmp_file.close()
        mock_uploaded_file = MockUploadedFile(open(self.tmp_file.name, 'r'))
        result = json.loads(self.datasets.create(csv_file=mock_uploaded_file))
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)
        self.dataset_id = result[Dataset.ID]

    def _test_get_info(self):
        result = json.loads(self.datasets.info(self.dataset_id))
        self.assertTrue(isinstance(result, dict))

    def _test_get_summary(self):
        result = json.loads(
            self.datasets.summary(self.dataset_id,
                                  select=self.datasets.SELECT_ALL_FOR_SUMMARY))
        self.assertTrue(isinstance(result, dict))

    def _test_get_summary_with_group(self, group):
        result = json.loads(
            self.datasets.summary(self.dataset_id,
                                  group=group,
                                  select=self.datasets.SELECT_ALL_FOR_SUMMARY))
        self.assertTrue(isinstance(result, dict))
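
The run_profiler decorator used by _test_profile is not shown in this
example. A minimal sketch consistent with its use (an assumption, not the
project's actual implementation) wraps the call with cProfile:

import cProfile
import functools

def run_profiler(func):
    # Hypothetical sketch: profile the wrapped call and print cumulative
    # timings, which is all the profiling tests need from the decorator.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        result = profiler.runcall(func, *args, **kwargs)
        profiler.print_stats('cumulative')
        return result
    return wrapper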
Example #11
class TestCalculations(TestBase):

    def setUp(self):
        TestBase.setUp(self)
        self.controller = Calculations()
        self.dataset_controller = Datasets()
        self.dataset_id = None
        self.formula = 'amount + gps_alt'
        self.name = 'test'

    def _post_formula(self, formula=None, name=None):
        if not formula:
            formula = self.formula
        if not name:
            name = self.name

        if not self.dataset_id:
            self.dataset_id = self._post_file()

        return self.controller.create(self.dataset_id, formula, name)

    def _post_update(self, dataset_id, update):
        return json.loads(self.dataset_controller.update(
            dataset_id=dataset_id, update=json.dumps(update)))

    def _wait_for_calculation_ready(self, dataset_id, name):
        while True:
            calculation = Calculation.find_one(dataset_id, name)

            if calculation.is_ready:
                break

            sleep(self.SLEEP_DELAY)

    def _test_error(self, response, error_text=None):
        response = json.loads(response)

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.ERROR in response)

        if not error_text:
            error_text = 'Must provide'

        self.assertTrue(error_text in response[self.controller.ERROR])

    def _test_create_from_json(self, json_filename, non_agg_cols=1, group=None,
                               ex_len=1):
        json_filepath = 'tests/fixtures/%s' % json_filename
        mock_uploaded_file = self._file_mock(json_filepath)
        dataset = Dataset.find_one(self.dataset_id)
        prev_columns = len(dataset.dframe().columns)
        response = json.loads(self.controller.create(
            self.dataset_id, json_file=mock_uploaded_file, group=group))

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        self.assertEqual(
            ex_len, len(json.loads(self.controller.show(self.dataset_id))))
        self.assertEqual(
            prev_columns + non_agg_cols,
            len(dataset.reload().dframe().columns))

        return dataset

    def test_show(self):
        self._post_formula()
        response = self.controller.show(self.dataset_id)

        self.assertTrue(isinstance(json.loads(response), list))

    def test_create(self):
        response = json.loads(self._post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(TestAbstractDatasets.NUM_ROWS, len(dataset.dframe()))

    @requires_async
    def test_create_async_not_ready(self):
        self.dataset_id = create_dataset_from_url(
            '%s%s' % (self._local_fixture_prefix(), 'good_eats_huge.csv'),
            allow_local_file=True).dataset_id
        response = json.loads(self._post_formula())
        dataset = Dataset.find_one(self.dataset_id)

        self.assertFalse(dataset.is_ready)
        self.assertTrue(isinstance(response, dict))
        self.assertFalse(DATASET_ID in response)

        self._wait_for_dataset_state(self.dataset_id)

        self.assertFalse(self.name in dataset.schema.keys())

    @requires_async
    def test_create_async_sets_calculation_status(self):
        self.dataset_id = create_dataset_from_url(
            '%s%s' % (self._local_fixture_prefix(), 'good_eats_huge.csv'),
            allow_local_file=True).dataset_id

        self._wait_for_dataset_state(self.dataset_id)

        response = json.loads(self._post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        response = json.loads(self.controller.show(self.dataset_id))[0]

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(Calculation.STATE in response)
        self.assertEqual(response[Calculation.STATE],
                         Calculation.STATE_PENDING)

        self._wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.schema.keys())

    @requires_async
    def test_create_async(self):
        self.dataset_id = self._post_file()

        self._wait_for_dataset_state(self.dataset_id)

        response = json.loads(self._post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        self._wait_for_calculation_ready(self.dataset_id, self.name)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.schema.keys())

        dataset = Dataset.find_one(self.dataset_id)

        self.assertEqual(TestAbstractDatasets.NUM_ROWS, len(dataset.dframe()))

    def test_create_invalid_formula(self):
        dataset_id = self._post_file()
        result = json.loads(
            self.controller.create(dataset_id, '=NON_EXIST', self.name))

        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Datasets.ERROR in result.keys())

    def test_create_remove_summary(self):
        dataset_id = self._post_file()
        Datasets().summary(
            dataset_id,
            select=Datasets.SELECT_ALL_FOR_SUMMARY)
        dataset = Dataset.find_one(dataset_id)

        self.assertTrue(isinstance(dataset.stats, dict))
        self.assertTrue(isinstance(dataset.stats[Dataset.ALL], dict))

        self._post_formula()
        # stats should have new column for calculation
        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name in dataset.stats.get(Dataset.ALL).keys())

    def test_delete_nonexistent_calculation(self):
        dataset_id = self._post_file()
        result = json.loads(self.controller.delete(dataset_id, self.name))

        self.assertTrue(Calculations.ERROR in result)

    def test_delete(self):
        self._post_formula()
        result = json.loads(self.controller.delete(self.dataset_id, self.name))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue(self.name not in dataset.schema.labels_to_slugs)

    def test_show_jsonp(self):
        self._post_formula()
        results = self.controller.show(self.dataset_id, callback='jsonp')

        self.assertEqual('jsonp(', results[0:6])
        self.assertEqual(')', results[-1])

    def test_create_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        response = json.loads(self._post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        dataset = Dataset.find_one(self.dataset_id)

        self.assertTrue('' in dataset.aggregated_datasets_dict.keys())

    def test_delete_aggregation(self):
        self.formula = 'sum(amount)'
        self.name = 'test'
        response = json.loads(self._post_formula())
        result = json.loads(
            self.controller.delete(self.dataset_id, self.name, ''))

        self.assertTrue(AbstractController.SUCCESS in result)

        dataset = Dataset.find_one(self.dataset_id)
        agg_dataset = Dataset.find_one(dataset.aggregated_datasets_dict[''])

        self.assertTrue(self.name not in agg_dataset.schema.labels_to_slugs)

    def test_error_on_delete_calculation_with_dependency(self):
        self._post_formula()
        dep_name = self.name
        self.formula = dep_name
        self.name = 'test1'
        response = json.loads(self._post_formula())

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)

        result = json.loads(
            self.controller.delete(self.dataset_id, dep_name, ''))

        self.assertTrue(AbstractController.ERROR in result)
        self.assertTrue('depend' in result[AbstractController.ERROR])

    def test_create_multiple(self):
        self.dataset_id = self._post_file()
        self._test_create_from_json(
            'good_eats.calculations.json', non_agg_cols=2, ex_len=2)

    def test_create_multiple_ignore_group(self):
        self.dataset_id = self._post_file()
        dataset = self._test_create_from_json(
            'good_eats.calculations.json', non_agg_cols=2, ex_len=2,
            group='risk_factor')

        self.assertEqual(dataset.aggregated_datasets_dict, {})

    def test_create_json_single(self):
        self.dataset_id = self._post_file()
        self._test_create_from_json('good_eats_single.calculations.json')

    def test_create_multiple_with_group(self):
        self.dataset_id = self._post_file()
        group = 'risk_factor'
        dataset = self._test_create_from_json(
            'good_eats_group.calculations.json',
            non_agg_cols=2, ex_len=3, group=group)

        self.assertTrue(group in dataset.aggregated_datasets_dict.keys())

    def test_create_with_missing_args(self):
        self.dataset_id = self._post_file()
        self._test_error(self.controller.create(self.dataset_id))
        self._test_error(
            self.controller.create(self.dataset_id, formula='gps_alt'))
        self._test_error(
            self.controller.create(self.dataset_id, name='test'))

    def test_create_with_bad_json(self):
        self.dataset_id = self._post_file()
        json_filepath = self._fixture_path_prefix(
            'good_eats_bad.calculations.json')
        mock_uploaded_file = self._file_mock(json_filepath)

        self._test_error(
            self.controller.create(self.dataset_id,
                                   json_file=mock_uploaded_file),
            error_text='Required')

        # Mock is now an empty file
        self._test_error(
            self.controller.create(self.dataset_id,
                                   json_file=mock_uploaded_file),
            error_text='No JSON')

    def test_create_reserved_name(self):
        name = 'sum'
        response = json.loads(self._post_formula(None, name))

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

        dataset = Dataset.find_one(self.dataset_id)
        slug = dataset.schema.labels_to_slugs[name]
        response = json.loads(self._post_formula('%s + amount' % slug))

        self.assertTrue(isinstance(response, dict))
        self.assertTrue(self.controller.SUCCESS in response)
        self.assertTrue(self.dataset_id in response[self.controller.SUCCESS])

    def test_create_dataset_with_duplicate_column_names(self):
        formula_names = [
            'water_not_functioning_none',  # an already slugged column
            'water_not_functioning/none',  # a non-slug column
            'region',                # an existing column
            'sum',                   # a reserved key
            'date',                  # a reserved key and an existing column
        ]

        for formula_name in formula_names:
            dataset_id = self._post_file('water_points.csv')
            dframe_before = Dataset.find_one(dataset_id).dframe()

            # a calculation
            response = json.loads(self.controller.create(
                dataset_id,
                'water_source_type in ["borehole"]',
                formula_name))

            self.assertTrue(isinstance(response, dict))
            self.assertTrue(self.controller.SUCCESS in response)

            dataset = Dataset.find_one(dataset_id)
            name = dataset.calculations()[-1].name

            # an aggregation
            response = json.loads(self.controller.create(
                dataset_id,
                'newest(date_, water_functioning)',
                formula_name))

            self.assertTrue(isinstance(response, dict))
            self.assertTrue(self.controller.SUCCESS in response)

            dframe_after = dataset.dframe()
            slug = dataset.schema.labels_to_slugs[name]

            self.assertEqual(len(dframe_before), len(dframe_after))
            self.assertTrue(slug not in dframe_before.columns)
            self.assertTrue(slug in dframe_after.columns)
            self.assertEqual(
                len(dframe_before.columns) + 1, len(dframe_after.columns))

            # check OK on update
            update = {
                'date': '2013-01-05',
                'water_source_type': 'borehole',
            }
            result = self._post_update(dataset_id, update)
            dataset = Dataset.find_one(dataset_id)
            dframe_after_update = dataset.dframe()
            self.assertEqual(len(dframe_after) + 1, len(dframe_after_update))

    def test_newest(self):
        expected_dataset = {
            u'wp_functional': {0: u'no', 1: u'yes', 2: u'no', 3: u'yes'},
            u'id': {0: 1, 1: 2, 2: 3, 3: 4}}
        dataset_id = self._post_file('newest_test.csv')
        self.controller.create(dataset_id,
                               'newest(submit_date,functional)',
                               'wp_functional', group='id')
        dataset = Dataset.find_one(dataset_id)
        agg_ds = dataset.aggregated_datasets['id']

        self.assertEqual(expected_dataset, agg_ds.dframe().to_dict())

    def test_update_after_agg(self):
        dataset_id = self._post_file('wp_data.csv')
        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,wp_id)', 'wp_newest'))

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        self.assertTrue(self.controller.SUCCESS in results.keys())
        self.assertFalse(dataset.aggregated_datasets.get('') is None)

        update = {
            'submit_date': '2013-01-05',
            'wp_id': 'D',
            'functional': 'no',
        }
        self._post_update(dataset_id, update)
        update = {
            'wp_id': 'E',
            'functional': 'no',
        }
        self._post_update(dataset_id, update)

        dataset = Dataset.find_one(dataset_id)
        current_num_rows = dataset.num_rows

        self.assertEqual(
            dataset.aggregated_datasets[''].dframe().get_value(0, 'wp_newest'),
            'D')
        self.assertEqual(current_num_rows, previous_num_rows + 2)

    @requires_async
    def test_update_after_agg_group(self):
        dataset_id = self._post_file('wp_data.csv')
        group = 'wp_id'
        self._wait_for_dataset_state(dataset_id)

        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,functional)', 'wp_functional',
                             group=group))
        results = json.loads(self.controller.create(dataset_id,
                             'max(submit_date)', 'latest_submit_date',
                             group=group))

        dataset = Dataset.find_one(dataset_id)
        previous_num_rows = dataset.num_rows

        while True:
            dataset = Dataset.find_one(dataset_id)

            if dataset.aggregated_datasets.get(group) and all(
                    [c.is_ready for c in dataset.calculations()]):
                break
            sleep(1)

        agg_dframe = dataset.aggregated_datasets[group].dframe()
        self.assertEqual(
            set(['wp_id', 'wp_functional', 'latest_submit_date']),
            set(agg_dframe.columns.tolist()))

        self.assertTrue(self.controller.SUCCESS in results.keys())

        update = {
            'wp_id': 'D',
            'functional': 'yes',
        }
        self._post_update(dataset_id, update)
        update = {
            'submit_date': '2013-01-08',
            'wp_id': 'A',
            'functional': 'no',
        }
        self._post_update(dataset_id, update)

        while True:
            dataset = Dataset.find_one(dataset_id)
            current_num_rows = dataset.num_rows

            if not len(dataset.pending_updates):
                break

            sleep(1)

        dataset = Dataset.find_one(dataset_id)
        agg_dframe = dataset.aggregated_datasets[group].dframe()

        self.assertEqual(agg_dframe.get_value(0, 'wp_id'), 'A')
        self.assertEqual(current_num_rows, previous_num_rows + 2)
        self.assertEqual(set(agg_dframe[group]),
                         set(['A', 'B', 'C', 'D', 'n/a']))

    @requires_async
    def test_fail_in_background(self):
        dataset_id = self._post_file('wp_data.csv')
        group = 'wp_id'
        self._wait_for_dataset_state(dataset_id)

        results = json.loads(self.controller.create(dataset_id,
                             'newest(submit_date,functional)', 'wp_functional',
                             group=group))
        results = json.loads(self.controller.create(dataset_id,
                             'max(submit_date)', 'latest_submit_date',
                             group=group))

        # Rename the calculation so the dataset still appears to have a
        # pending calculation, forcing retries; the update fails after
        # 10 retries.
        calc = Calculation.find_one(dataset_id, 'latest_submit_date', group)
        calc.update({calc.NAME: 'another_name'})

        update = {
            'wp_id': 'D',
            'functional': 'yes',
        }
        self._post_update(dataset_id, update)
        update = {
            'submit_date': '2013-01-08',
            'wp_id': 'A',
            'functional': 'no',
        }
        self._post_update(dataset_id, update)

        while True:
            dataset = Dataset.find_one(dataset_id)
            current_num_rows = dataset.num_rows
            calcs_not_pending = [
                c.state != c.STATE_PENDING for c in dataset.calculations()]

            if not len(dataset.pending_updates) and all(calcs_not_pending):
                break

            sleep(1)

        for c in dataset.calculations():
            self.assertEqual(c.STATE_FAILED, c.state)
            self.assertTrue('Traceback' in c.error_message)
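
The requires_async decorator marks tests that exercise background
processing. A plausible sketch (purely an assumption; the real decorator
lives in the test utilities) toggles asynchronous mode around the call:

import functools

def requires_async(func):
    # Hypothetical: enable background task processing for the wrapped
    # test, restoring synchronous mode afterwards. set_async is an
    # assumed helper, not a documented bamboo API.
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        set_async(True)
        try:
            return func(self, *args, **kwargs)
        finally:
            set_async(False)
    return wrapper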
Example #12
class TestAbstractDatasets(TestBase):

    NUM_COLS = 15
    NUM_ROWS = 19

    def setUp(self):
        TestBase.setUp(self)
        self.controller = Datasets()
        self._file_name = 'good_eats.csv'
        self._update_file_name = 'good_eats_update.json'
        self._update_check_file_path = '%sgood_eats_update_values.json' % (
            self.FIXTURE_PATH)
        self.default_formulae = [
            'amount',
            'amount + 1',
            'amount - 5',
        ]

    def _put_row_updates(self, dataset_id=None, file_name=None, validate=True):
        if not dataset_id:
            dataset_id = self.dataset_id

        if not file_name:
            file_name = self._update_file_name

        update = open('%s%s' % (self.FIXTURE_PATH, file_name), 'r').read()
        result = json.loads(self.controller.update(dataset_id=dataset_id,
                                                   update=update))

        if validate:
            self.assertTrue(isinstance(result, dict))
            self.assertTrue(Dataset.ID in result)

        # set up the (default) values to test against
        with open(self._update_check_file_path, 'r') as f:
            self._update_values = json.loads(f.read())

    def _load_schema(self):
        return json.loads(
            self.controller.info(self.dataset_id))[Dataset.SCHEMA]

    def _check_dframes_are_equal(self, dframe1, dframe2):
        rows1 = comparable(dframe1)
        rows2 = comparable(dframe2)

        self.__check_dframe_is_subset(rows1, rows2)
        self.__check_dframe_is_subset(rows2, rows1)

    def __check_dframe_is_subset(self, rows1, rows2):
        for row in rows1:
            self.assertTrue(row in rows2,
                            '\nrow:\n%s\n\nnot in rows2:\n%s' % (row, rows2))

    def _post_calculations(self, formulae=[], group=None):
        schema = self._load_schema()
        controller = Calculations()

        for idx, formula in enumerate(formulae):
            name = 'calc_%d' % idx if not schema or\
                formula in schema.keys() else formula

            controller.create(self.dataset_id, formula=formula, name=name,
                              group=group)

    def _test_summary_built(self, result):
        # check that summary is created
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)
        self.dataset_id = result[Dataset.ID]

        results = self.controller.summary(
            self.dataset_id,
            select=self.controller.SELECT_ALL_FOR_SUMMARY)

        return self._test_summary_results(results)

    def _test_summary_results(self, results):
        results = json.loads(results)
        self.assertTrue(isinstance(results, dict))
        return results

    def _test_aggregations(self, groups=['']):
        results = json.loads(self.controller.aggregations(self.dataset_id))
        self.assertTrue(isinstance(results, dict))
        self.assertEqual(len(results.keys()), len(groups))
        self.assertEqual(results.keys(), groups)
        linked_dataset_id = results[groups[0]]
        self.assertTrue(isinstance(linked_dataset_id, basestring))

        # inspect linked dataset
        return json.loads(self.controller.show(linked_dataset_id))

    def _test_summary_no_group(self, results, dataset_id=None, group=None):
        if not dataset_id:
            dataset_id = self.dataset_id

        group = [group] if group else []
        result_keys = results.keys()

        # minus the column that we are grouping on
        self.assertEqual(len(result_keys), self.NUM_COLS - len(group))

        columns = [col for col in
                   self.get_data(self._file_name).columns.tolist()
                   if not col in [MONGO_ID] + group]

        dataset = Dataset.find_one(dataset_id)
        labels_to_slugs = dataset.schema.labels_to_slugs

        for col in columns:
            slug = labels_to_slugs[col]
            self.assertTrue(slug in result_keys,
                            'col (slug): %s in: %s' % (slug, result_keys))
            self.assertTrue(SUMMARY in results[slug].keys())
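
The comparable helper used by _check_dframes_are_equal is imported from the
test utilities and not shown here. A minimal sketch that matches its use,
turning a DataFrame into an order-independent list of row dicts, might look
like this (an assumption about the real helper):

def comparable(dframe):
    # Represent each row as a plain dict so rows can be tested for
    # membership in another frame's rows regardless of row order.
    return [row.to_dict() for _, row in dframe.iterrows()]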
Example #13
def setUp(self):
    TestBase.setUp(self)
    self.datasets = Datasets()
    self.tmp_file = NamedTemporaryFile(delete=False)
Example #14
class TestProfile(TestBase):

    TEST_CASE_SIZES = {
        'tiny': (1, 1),
        'small': (2, 2),
        'large': (4, 40),
    }

    def setUp(self):
        TestBase.setUp(self)
        self.datasets = Datasets()
        self.tmp_file = NamedTemporaryFile(delete=False)

    def tearDown(self):
        os.unlink(self.tmp_file.name)

    def _expand_width(self, df, exponent):
        for i in xrange(0, exponent):
            other = df.rename(
                columns={col: '%s-%s' % (col, idx) for (idx, col) in
                         enumerate(df.columns)})
            df = df.join(other)
            df.rename(columns={col: str(idx) for (idx, col) in
                      enumerate(df.columns)}, inplace=True)
        return df

    def _grow_test_data(self, dataset_name, width_exp, length_factor):
        df = self.get_data(dataset_name)
        df = self._expand_width(df, width_exp)
        return concat([df] * length_factor)

    def test_tiny_profile(self):
        self._test_profile('tiny')

    def test_small_profile(self):
        self._test_profile('small')

    def test_large_profile(self):
        self._test_profile('large')

    @run_profiler
    def _test_profile(self, size):
        print 'bamboo/bamboo: %s' % size
        self._test_create_data(*self.TEST_CASE_SIZES[size])
        print 'saving dataset'
        self._test_save_dataset()
        self._test_get_info()
        self._test_get_summary()
        self._test_get_summary_with_group('province')
        self._test_get_summary_with_group('school_zone')

    def _test_create_data(self, width_exp, length_factor):
        self.data = self._grow_test_data(
            'kenya_secondary_schools_2007.csv', width_exp, length_factor)
        print 'bamboo/bamboo rows: %s, columns: %s' % (
            len(self.data), len(self.data.columns))

    def _test_save_dataset(self):
        self.data.to_csv(self.tmp_file)
        self.tmp_file.close()
        mock_uploaded_file = MockUploadedFile(open(self.tmp_file.name, 'r'))
        result = json.loads(self.datasets.create(csv_file=mock_uploaded_file))
        self.assertTrue(isinstance(result, dict))
        self.assertTrue(Dataset.ID in result)
        self.dataset_id = result[Dataset.ID]

    def _test_get_info(self):
        result = json.loads(self.datasets.info(self.dataset_id))
        self.assertTrue(isinstance(result, dict))

    def _test_get_summary(self):
        result = json.loads(self.datasets.summary(
            self.dataset_id,
            select=self.datasets.SELECT_ALL_FOR_SUMMARY))
        self.assertTrue(isinstance(result, dict))

    def _test_get_summary_with_group(self, group):
        result = json.loads(self.datasets.summary(
            self.dataset_id, group=group,
            select=self.datasets.SELECT_ALL_FOR_SUMMARY))
        self.assertTrue(isinstance(result, dict))
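
MockUploadedFile, used by _test_save_dataset, stands in for the
uploaded-file object CherryPy hands to the Datasets controller. A plausible
minimal version (hypothetical; the real mock lives in the test utilities)
only needs to expose the wrapped file:

class MockUploadedFile(object):
    # Wrap an open file object the way CherryPy wraps a multipart upload,
    # exposing `file` and a filename for the controller to read from.
    def __init__(self, fileobj, filename='data.csv'):
        self.file = fileobj
        self.filename = filename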