Example No. 1
    def test_delete(self):
        self._save_observations()
        records = [x for x in Observation.find(self.dataset)]
        self.assertNotEqual(records, [])
        Observation.delete(self.dataset)
        records = [x for x in Observation.find(self.dataset)]
        self.assertEqual(records, [])
Example No. 2
    def test_find_as_df(self):
        self._save_observations()
        records = [x for x in Observation.find(self.dataset)]
        dframe = Observation.find(self.dataset, as_df=True)
        self.assertTrue(isinstance(dframe, DataFrame))
        self.assertEqual(self.test_data['good_eats.csv'].reindex(
                columns=dframe.columns), dframe)
        columns = dframe.columns
        for key in MONGO_RESERVED_KEYS:
            self.assertFalse(prefix_reserved_key(key) in columns)
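The test above assumes that keys MongoDB reserves (such as _id) are stored under a prefixed name and that the prefix never leaks into the DataFrame returned by Observation.find. The sketch below only illustrates that key-mangling idea; the prefix value and the encode_record helper are assumptions for illustration, not bamboo's actual implementation.

# Minimal sketch of prefixing reserved MongoDB keys before storage.
# RESERVED_PREFIX and encode_record are hypothetical names.
MONGO_RESERVED_KEYS = ['_id']
RESERVED_PREFIX = 'MONGO_RESERVED_KEY'


def prefix_reserved_key(key, prefix=RESERVED_PREFIX):
    # storage name used for a reserved key
    return '%s%s' % (prefix, key)


def encode_record(record):
    # rename reserved keys so the original values are not clobbered by Mongo
    return dict(
        (prefix_reserved_key(k) if k in MONGO_RESERVED_KEYS else k, v)
        for k, v in record.items())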
Example No. 3
    def test_find_with_select_and_query(self):
        self._save_observations()
        cursor = Observation.find(self.dataset, '{"rating": "delectible"}',
                '{"rating": 1}')
        self.assertTrue(isinstance(cursor, Cursor))
        results = [row for row in cursor]
        self.assertEquals(sorted(results[0].keys()), ['_id', 'rating'])
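The second and third arguments here are JSON strings that Observation.find hands to MongoDB as a query and a projection. With the projection {"rating": 1}, MongoDB returns only the projected field plus _id, which is why the test expects exactly ['_id', 'rating']. A minimal pymongo sketch of that projection behaviour follows; the database, collection, and field names are illustrative only.

import json

from pymongo import MongoClient

collection = MongoClient()['example_db']['observations']
collection.insert_one({'rating': 'delectible', 'amount': 2.0})

query = json.loads('{"rating": "delectible"}')  # mongo query
select = json.loads('{"rating": 1}')            # mongo projection

row = collection.find_one(query, select)
# the projection keeps only the selected field plus '_id'
assert sorted(row.keys()) == ['_id', 'rating']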
Example No. 4
    def _save_records(self):
        records = Observation.save(self.test_data['good_eats.csv'],
                self.dataset)
        cursor = Observation.find(self.dataset)
        records = [x for x in cursor]
        self.assertTrue(isinstance(records, list))
        self.assertTrue(isinstance(records[0], dict))
        self.assertTrue('_id' in records[0].keys())
        return records
Example No. 5
    def save(cls, dataset, formula, name, **kwargs):
        """
        Attempt to parse formula, then save formula, and add a task to calculate
        formula.
        """

        dframe = Observation.find(dataset, as_df=True)

        # attempt to get a row from the dataframe
        try:
            row = dframe.irow(0)
        except IndexError, err:
            row = {}
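dframe.irow(0) is the old pandas API for positional row access, and the try/except guards against a dataset that has no rows at all. The same pattern written against current pandas, where DataFrame.irow() has been replaced by .iloc, looks like this (a sketch, not bamboo code):

import pandas as pd


def sample_row(dframe):
    # return the first row, or an empty dict if the frame has no rows
    try:
        return dframe.iloc[0]
    except IndexError:
        return {}


print(sample_row(pd.DataFrame({'rating': ['delectible']})))
print(sample_row(pd.DataFrame()))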
Example No. 6
    def GET(self, dataset_id, mode=False, query='{}', select=None,
            group=ALL):
        """
        Return data set for hash *dataset_id*.
        Execute query *query* in mongo if passed.
        If summary is passed return summary statistics for data set.
        If group is passed group the summary, if summary is false group is
        ignored.
        """
        dataset = Dataset.find_one(dataset_id)
        result = None

        try:
            if dataset:
                if mode == MODE_INFO:
                    result = Dataset.schema(dataset)
                elif mode == MODE_SUMMARY:
                    result = summarize(dataset, query, select, group)
                else:
                    return mongo_to_json(Observation.find(dataset, query,
                                select))
        except JSONError, e:
            result = {ERROR: e.__str__()}
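The try/except turns a malformed client query into an error payload instead of a traceback. A self-contained sketch of that idea is below; JSONError is stood in for with a local exception class, since only its use is visible in this excerpt.

import json

ERROR = 'error'


class JSONError(Exception):
    # stand-in for bamboo's exception, defined only for this sketch
    pass


def parse_query(query='{}'):
    try:
        return json.loads(query)
    except ValueError as err:
        raise JSONError('unable to parse query: %s' % err)


try:
    parse_query('{"rating": "delectible"')  # missing closing brace
except JSONError as err:
    result = {ERROR: str(err)}
print(result)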
Example No. 7
def summarize(dataset, query, select, group):
    """
    Return a summary for the rows/values filtered by *query* and *select*
    and grouped by *group* or the overall summary if no group is specified.
    """
    # narrow list of observations via query/select
    dframe = Observation.find(dataset, query, select, as_df=True)

    # do not allow group by numeric types
    # TODO check schema for valid groupby columns once included
    _type = dframe.dtypes.get(group)
    if group != ALL and (_type is None or _type.type != np.object_):
        return {ERROR: "group: '%s' is not categorical." % group}

    # check cached stats for group and update as necessary
    stats = dataset.get(STATS, {})
    if not stats.get(group):
        stats = {ALL: summarize_df(dframe)} if group == ALL \
                else summarize_with_groups(dframe, stats, group)
        Dataset.update(dataset, {STATS: stats})
    stats_to_return = stats.get(group)

    return dict_from_mongo(stats_to_return if group == ALL else {group:
            stats_to_return})
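The dtype check above is what rejects group-by on numeric columns: a missing column has no dtype, and anything other than an object (string) column is refused. A toy pandas illustration of that check (not bamboo code):

import numpy as np
import pandas as pd

dframe = pd.DataFrame({'rating': ['delectible', 'epic_eat'],
                       'amount': [2.0, 3.5]})

for group in ['rating', 'amount', 'missing_column']:
    _type = dframe.dtypes.get(group)
    if _type is None or _type.type != np.object_:
        print("group: '%s' is not categorical." % group)
    else:
        print("group: '%s' can be grouped on." % group)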
Example No. 8
    def _test_calculator(self, delay=True):
        dframe = Observation.find(self.dataset, as_df=True)

        columns = dframe.columns.tolist()
        start_num_cols = len(columns)
        added_num_cols = 0

        column_labels_to_slugs = build_labels_to_slugs(self.dataset)
        label_list, slugified_key_list = [list(ary) for ary in
                zip(*column_labels_to_slugs.items())]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx
            if delay:
                task = calculate_column.delay(self.dataset, dframe,
                        formula, name)
                # test that task has completed
                self.assertTrue(task.ready())
                self.assertTrue(task.successful())
            else:
                task = calculate_column(self.dataset, dframe,
                        formula, name)

            column_labels_to_slugs = build_labels_to_slugs(self.dataset)

            unslug_name = name
            name = column_labels_to_slugs[unslug_name]

            # test that updated dataframe persisted
            dframe = Observation.find(self.dataset, as_df=True)
            self.assertTrue(name in dframe.columns)

            # test new number of columns
            added_num_cols += 1
            self.assertEqual(start_num_cols + added_num_cols,
                    len(dframe.columns.tolist()))

            # test that the schema is up to date
            dataset = Dataset.find_one(self.dataset[DATASET_ID])
            self.assertTrue(SCHEMA in dataset.keys())
            self.assertTrue(isinstance(dataset[SCHEMA], dict))
            schema = dataset[SCHEMA]

            # test slugified column names
            slugified_key_list.append(name)
            self.assertEqual(sorted(schema.keys()), sorted(slugified_key_list))

            # test column labels
            label_list.append(unslug_name)
            labels = [schema[col][LABEL] for col in schema.keys()]
            self.assertEqual(sorted(labels), sorted(label_list))

            # test result of calculation
            formula = column_labels_to_slugs[formula]

            for idx, row in dframe.iterrows():
                try:
                    result = np.float64(row[name])
                    stored = np.float64(row[formula])
                    # np.nan != np.nan, continue if we have two nan values
                    if np.isnan(result) and np.isnan(stored):
                        continue
                    msg = self._equal_msg(result, stored, formula)
                    self.assertAlmostEqual(result, stored, self.places, msg)
                except ValueError:
                    msg = self._equal_msg(row[name], row[formula], formula)
                    self.assertEqual(row[name], row[formula], msg)
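The inner loop compares the calculated column against the formula column as floats and explicitly treats a pair of NaNs as equal, because np.nan != np.nan. That comparison rule, pulled out into a standalone helper (a sketch, not bamboo code):

import numpy as np


def almost_equal_with_nan(result, stored, places=7):
    # mirror assertAlmostEqual, but let two NaN values count as a match
    result, stored = np.float64(result), np.float64(stored)
    if np.isnan(result) and np.isnan(stored):
        return True
    return round(abs(result - stored), places) == 0


assert almost_equal_with_nan(np.nan, np.nan)
assert almost_equal_with_nan(1.0, 1.0 + 1e-9)
assert not almost_equal_with_nan(1.0, 2.0)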
Example No. 9
    def test_find_with_query(self):
        self._save_observations()
        cursor = Observation.find(self.dataset, '{"rating": "delectible"}')
        self.assertTrue(isinstance(cursor, Cursor))
Example No. 10
    def test_find(self):
        self._save_observations()
        cursor = Observation.find(self.dataset)
        self.assertTrue(isinstance(cursor, Cursor))
Example No. 11
    def test_save_over_bulk(self):
        Observation.save(self.test_data['good_eats_large.csv'],
                self.dataset)
        cursor = Observation.find(self.dataset)
        records = [x for x in cursor]
        self.assertEqual(len(records), 1001)