def save(cls, dframe, dataset):
    """Persist *dframe* to mongo for *dataset*.

    Converts the frame's column labels to slugs, stamps every row with the
    dataset's observation id, and inserts the rows into ``cls.collection``
    in chunks of size *DB_BATCH_SIZE*.
    """
    # add metadata to file
    dataset_observation_id = dataset[DATASET_OBSERVATION_ID]
    rows = []
    labels_to_slugs = build_labels_to_slugs(dataset)
    # if column name is not in map assume it is already slugified
    # (i.e. NOT a label)
    dframe.columns = [labels_to_slugs.get(column, column)
                      for column in dframe.columns.tolist()]
    for _, row in dframe.iterrows():
        row = row.to_dict()
        row[DATASET_OBSERVATION_ID] = dataset_observation_id
        rows.append(row)
        # flush once the batch reaches DB_BATCH_SIZE rows
        # (">" here would silently produce batches of DB_BATCH_SIZE + 1)
        if len(rows) >= DB_BATCH_SIZE:
            # insert data into collection
            cls.collection.insert(rows, safe=True)
            rows = []
    # insert any remaining partial batch
    if len(rows):
        cls.collection.insert(rows, safe=True)
def _test_summary_no_group(self, results):
    """Assert that ungrouped summary *results* contain one SUMMARY entry
    per (non-reserved) column of the uploaded test file, keyed by slug.
    """
    result_keys = results.keys()
    self.assertEqual(len(result_keys), self.NUM_COLS)
    # mongo reserved keys (e.g. _id) are never summarized
    columns = [col for col in
               self.test_data[self._file_name].columns.tolist()
               if col not in MONGO_RESERVED_KEYS]
    dataset = Dataset.find_one(self.dataset_id)
    labels_to_slugs = build_labels_to_slugs(dataset)
    for col in columns:
        # results are keyed by slug, not by the original column label
        slug = labels_to_slugs[col]
        self.assertTrue(slug in result_keys,
                        'col (slug): %s in: %s' % (slug, result_keys))
        self.assertTrue(SUMMARY in results[slug].keys())
def _test_calculator(self, delay=True):
    """Run every formula in ``self.calculations`` through calculate_column
    (async via celery ``.delay`` when *delay* is True, synchronously
    otherwise) and verify, after each one: the new column persisted, the
    column count grew by one, the dataset schema tracks slugs and labels,
    and the computed values match the stored source column.
    """
    dframe = Observation.find(self.dataset, as_df=True)
    columns = dframe.columns.tolist()
    start_num_cols = len(columns)
    added_num_cols = 0
    column_labels_to_slugs = build_labels_to_slugs(self.dataset)
    # parallel lists of the dataset's labels and their slugified keys;
    # each new calculated column is appended to both as we go
    label_list, slugified_key_list = [
        list(ary) for ary in zip(*column_labels_to_slugs.items())]
    for idx, formula in enumerate(self.calculations):
        name = 'test-%s' % idx
        if delay:
            task = calculate_column.delay(self.dataset, dframe,
                                          formula, name)
            # test that task has completed
            self.assertTrue(task.ready())
            self.assertTrue(task.successful())
        else:
            task = calculate_column(self.dataset, dframe, formula, name)
        # re-fetch the label->slug map: the calculation just added a column
        column_labels_to_slugs = build_labels_to_slugs(self.dataset)
        # keep the human label; `name` becomes the stored (slug) key
        unslug_name = name
        name = column_labels_to_slugs[unslug_name]
        # test that updated dataframe persisted
        dframe = Observation.find(self.dataset, as_df=True)
        self.assertTrue(name in dframe.columns)
        # test new number of columns
        added_num_cols += 1
        self.assertEqual(start_num_cols + added_num_cols,
                         len(dframe.columns.tolist()))
        # test that the schema is up to date
        dataset = Dataset.find_one(self.dataset[DATASET_ID])
        self.assertTrue(SCHEMA in dataset.keys())
        self.assertTrue(isinstance(dataset[SCHEMA], dict))
        schema = dataset[SCHEMA]
        # test slugified column names
        slugified_key_list.append(name)
        self.assertEqual(sorted(schema.keys()),
                         sorted(slugified_key_list))
        # test column labels
        label_list.append(unslug_name)
        labels = [schema[col][LABEL] for col in schema.keys()]
        self.assertEqual(sorted(labels), sorted(label_list))
        # test result of calculation
        # NOTE: `formula` is rebound here to the slug of the source column
        # the formula names — presumably each formula is a single column
        # reference; verify against the fixture calculations
        formula = column_labels_to_slugs[formula]
        for idx, row in dframe.iterrows():
            try:
                result = np.float64(row[name])
                stored = np.float64(row[formula])
                # np.nan != np.nan, continue if we have two nan values
                if np.isnan(result) and np.isnan(stored):
                    continue
                msg = self._equal_msg(result, stored, formula)
                self.assertAlmostEqual(result, stored, self.places, msg)
            except ValueError:
                # non-numeric cells: fall back to exact equality
                msg = self._equal_msg(row[name], row[formula], formula)
                self.assertEqual(row[name], row[formula], msg)