def test_delete(self):
    """Deleting a dataset's observations removes every stored record."""
    self._save_observations()
    # sanity check: observations exist before the delete
    saved = list(Observation.find(self.dataset))
    self.assertNotEqual(saved, [])
    Observation.delete(self.dataset)
    # after delete the same query must come back empty
    remaining = list(Observation.find(self.dataset))
    self.assertEqual(remaining, [])
def test_find_as_df(self):
    """find(as_df=True) yields a DataFrame with no prefixed reserved keys."""
    self._save_observations()
    records = list(Observation.find(self.dataset))
    dframe = Observation.find(self.dataset, as_df=True)
    self.assertTrue(isinstance(dframe, DataFrame))
    # the stored frame must match the fixture, column order normalized
    expected = self.test_data['good_eats.csv'].reindex(
        columns=dframe.columns)
    self.assertEqual(expected, dframe)
    # reserved mongo keys must have been stripped of their prefixes
    for reserved in MONGO_RESERVED_KEYS:
        self.assertFalse(prefix_reserved_key(reserved) in dframe.columns)
def test_find_with_select_and_query(self):
    """Find with both a query and a select.

    Only the selected field plus the implicit ``_id`` should come back.
    """
    self._save_observations()
    cursor = Observation.find(self.dataset, '{"rating": "delectible"}',
                              '{"rating": 1}')
    self.assertTrue(isinstance(cursor, Cursor))
    results = [row for row in cursor]
    # fix: assertEquals is a deprecated alias of assertEqual
    self.assertEqual(sorted(results[0].keys()), ['_id', 'rating'])
def _save_records(self):
    """Save the fixture CSV into the dataset and return the stored records.

    Asserts the round-trip produced a non-empty list of dicts carrying
    mongo ``_id`` keys.
    """
    # fix: the return value of save() was previously bound to `records`
    # and immediately overwritten below — drop the dead assignment
    Observation.save(self.test_data['good_eats.csv'], self.dataset)
    cursor = Observation.find(self.dataset)
    records = [x for x in cursor]
    self.assertTrue(isinstance(records, list))
    self.assertTrue(isinstance(records[0], dict))
    self.assertTrue('_id' in records[0].keys())
    return records
def save(cls, dataset, formula, name, **kwargs):
    """
    Attempt to parse formula, then save formula, and add a task to
    calculate formula.
    """
    dframe = Observation.find(dataset, as_df=True)
    # attempt to get a row from the dataframe
    try:
        row = dframe.irow(0)
    except IndexError:
        # fix: the exception was bound to an unused `err` name;
        # `except IndexError:` is valid in both Python 2 and 3.
        # Empty dataset — fall back to an empty row for parsing.
        row = {}
def GET(self, dataset_id, mode=False, query='{}', select=None, group=ALL):
    """
    Return data set for hash *dataset_id*. Execute query *query* in
    mongo if passed. If summary is passed return summary statistics
    for data set. If group is passed group the summary, if summary is
    false group is ignored.
    """
    dataset = Dataset.find_one(dataset_id)
    result = None
    try:
        # a falsy dataset (id not found) leaves result as None
        if dataset:
            if mode == MODE_INFO:
                # schema/metadata only, no row data
                result = Dataset.schema(dataset)
            elif mode == MODE_SUMMARY:
                # summary statistics, optionally grouped
                result = summarize(dataset, query, select, group)
            else:
                # default mode: raw observations serialized to JSON
                return mongo_to_json(Observation.find(dataset, query, select))
    except JSONError, e:
        # malformed query/select JSON from the client
        result = {ERROR: e.__str__()}
    # NOTE(review): in the visible span `result` is computed but never
    # returned — presumably serialized/returned just below this chunk;
    # confirm against the full file.
def summarize(dataset, query, select, group):
    """
    Return a summary for the rows/values filtered by *query* and *select*
    and grouped by *group* or the overall summary if no group is specified.
    """
    # narrow list of observations via query/select
    dframe = Observation.find(dataset, query, select, as_df=True)

    # do not allow group by numeric types
    # TODO check schema for valid groupby columns once included
    _type = dframe.dtypes.get(group)
    if group != ALL and (_type is None or _type.type != np.object_):
        return {ERROR: "group: '%s' is not categorical." % group}

    # check cached stats for group and update as necessary
    stats = dataset.get(STATS, {})
    if not stats.get(group):
        if group == ALL:
            stats = {ALL: summarize_df(dframe)}
        else:
            stats = summarize_with_groups(dframe, stats, group)
        Dataset.update(dataset, {STATS: stats})

    stats_to_return = stats.get(group)
    if group == ALL:
        payload = stats_to_return
    else:
        payload = {group: stats_to_return}
    return dict_from_mongo(payload)
def _test_calculator(self, delay=True):
    """Run every formula in self.calculations through calculate_column and
    verify the column, schema, labels, and computed values persist.

    With *delay* True the celery task interface is exercised; otherwise
    the function is called synchronously.
    """
    dframe = Observation.find(self.dataset, as_df=True)

    columns = dframe.columns.tolist()
    start_num_cols = len(columns)
    added_num_cols = 0

    column_labels_to_slugs = build_labels_to_slugs(self.dataset)
    # parallel lists of existing labels and their slugified keys
    label_list, slugified_key_list = [
        list(ary) for ary in zip(*column_labels_to_slugs.items())]

    for idx, formula in enumerate(self.calculations):
        name = 'test-%s' % idx
        if delay:
            task = calculate_column.delay(self.dataset, dframe,
                                          formula, name)
            # test that task has completed
            self.assertTrue(task.ready())
            self.assertTrue(task.successful())
        else:
            task = calculate_column(self.dataset, dframe,
                                    formula, name)

        # refresh the mapping: the new column changed the schema
        column_labels_to_slugs = build_labels_to_slugs(self.dataset)
        unslug_name = name
        # `name` is rebound to its slugified form for the checks below
        name = column_labels_to_slugs[unslug_name]

        # test that updated dataframe persisted
        dframe = Observation.find(self.dataset, as_df=True)
        self.assertTrue(name in dframe.columns)

        # test new number of columns
        added_num_cols += 1
        self.assertEqual(start_num_cols + added_num_cols,
                         len(dframe.columns.tolist()))

        # test that the schema is up to date
        dataset = Dataset.find_one(self.dataset[DATASET_ID])
        self.assertTrue(SCHEMA in dataset.keys())
        self.assertTrue(isinstance(dataset[SCHEMA], dict))
        schema = dataset[SCHEMA]

        # test slugified column names
        slugified_key_list.append(name)
        self.assertEqual(sorted(schema.keys()),
                         sorted(slugified_key_list))

        # test column labels
        label_list.append(unslug_name)
        labels = [schema[col][LABEL] for col in schema.keys()]
        self.assertEqual(sorted(labels), sorted(label_list))

        # test result of calculation
        # `formula` is rebound to the slug of the expected-result column
        formula = column_labels_to_slugs[formula]
        for idx, row in dframe.iterrows():
            try:
                result = np.float64(row[name])
                stored = np.float64(row[formula])
                # np.nan != np.nan, continue if we have two nan values
                if np.isnan(result) and np.isnan(stored):
                    continue
                msg = self._equal_msg(result, stored, formula)
                self.assertAlmostEqual(result, stored, self.places, msg)
            except ValueError:
                # non-numeric cells: fall back to exact equality
                msg = self._equal_msg(row[name], row[formula], formula)
                self.assertEqual(row[name], row[formula], msg)
def test_find_with_query(self):
    """Find filtered by a JSON query returns a pymongo Cursor."""
    self._save_observations()
    found = Observation.find(self.dataset, '{"rating": "delectible"}')
    self.assertTrue(isinstance(found, Cursor))
def test_find(self):
    """A bare find on the dataset returns a pymongo Cursor."""
    self._save_observations()
    found = Observation.find(self.dataset)
    self.assertTrue(isinstance(found, Cursor))
def test_save_over_bulk(self):
    """Saving the large fixture stores all 1001 rows."""
    Observation.save(self.test_data['good_eats_large.csv'], self.dataset)
    saved = list(Observation.find(self.dataset))
    self.assertEqual(len(saved), 1001)