Example #1
0
    def save(cls, dframe, dataset):
        """
        Convert *dframe* to mongo format, iterate through rows adding ids for
        *dataset*, insert in chunks of size *DB_BATCH_SIZE*.
        """
        # every stored row is tagged with its dataset's observation id
        dataset_observation_id = dataset[DATASET_OBSERVATION_ID]
        rows = []

        labels_to_slugs = build_labels_to_slugs(dataset)

        # if column name is not in map assume it is already slugified
        # (i.e. NOT a label)
        dframe.columns = [labels_to_slugs.get(column, column) for column in
                dframe.columns.tolist()]

        for _, row in dframe.iterrows():
            row = row.to_dict()
            row[DATASET_OBSERVATION_ID] = dataset_observation_id
            rows.append(row)
            # flush once the batch reaches DB_BATCH_SIZE; the original used
            # `>` which produced batches of DB_BATCH_SIZE + 1, contradicting
            # the documented chunk size
            if len(rows) >= DB_BATCH_SIZE:
                # insert data into collection
                cls.collection.insert(rows, safe=True)
                rows = []
        # insert any remaining partial batch
        if rows:
            cls.collection.insert(rows, safe=True)
Example #2
0
 def _test_summary_no_group(self, results):
     """
     Assert that *results* holds a summary for every non-reserved column of
     the test data: one entry per column, keyed by the column's slug, each
     containing a *SUMMARY* section.
     """
     result_keys = results.keys()
     # removed leftover debug `print` statements that cluttered test output
     self.assertEqual(len(result_keys), self.NUM_COLS)
     # mongo-reserved keys are not summarized, so skip them here too
     columns = [col for col in
             self.test_data[self._file_name].columns.tolist()
             if col not in MONGO_RESERVED_KEYS]
     dataset = Dataset.find_one(self.dataset_id)
     labels_to_slugs = build_labels_to_slugs(dataset)
     for col in columns:
         slug = labels_to_slugs[col]
         self.assertTrue(slug in result_keys,
                 'col (slug): %s in: %s' % (slug, result_keys))
         self.assertTrue(SUMMARY in results[slug].keys())
Example #3
0
    def _test_calculator(self, delay=True):
        """
        Run every formula in *self.calculations* through ``calculate_column``
        (asynchronously via celery when *delay* is True, synchronously
        otherwise) and verify that:

        - the new column is persisted to the dataset's observations,
        - the column count grows by one per calculation,
        - the dataset schema records the slug and label of each new column,
        - the stored values match the evaluated formula column row by row.
        """
        dframe = Observation.find(self.dataset, as_df=True)

        columns = dframe.columns.tolist()
        start_num_cols = len(columns)
        added_num_cols = 0

        column_labels_to_slugs = build_labels_to_slugs(self.dataset)
        label_list, slugified_key_list = [list(ary) for ary in
                zip(*column_labels_to_slugs.items())]

        for idx, formula in enumerate(self.calculations):
            name = 'test-%s' % idx
            if delay:
                task = calculate_column.delay(self.dataset, dframe,
                        formula, name)
                # test that task has completed
                self.assertTrue(task.ready())
                self.assertTrue(task.successful())
            else:
                task = calculate_column(self.dataset, dframe,
                        formula, name)

            # refresh the label -> slug map: it now includes the new column
            column_labels_to_slugs = build_labels_to_slugs(self.dataset)

            unslug_name = name
            name = column_labels_to_slugs[unslug_name]

            # test that updated dataframe persisted
            dframe = Observation.find(self.dataset, as_df=True)
            self.assertTrue(name in dframe.columns)

            # test new number of columns
            added_num_cols += 1
            self.assertEqual(start_num_cols + added_num_cols,
                    len(dframe.columns.tolist()))

            # test that the schema is up to date
            dataset = Dataset.find_one(self.dataset[DATASET_ID])
            self.assertTrue(SCHEMA in dataset.keys())
            self.assertTrue(isinstance(dataset[SCHEMA], dict))
            schema = dataset[SCHEMA]

            # test slugified column names
            slugified_key_list.append(name)
            self.assertEqual(sorted(schema.keys()), sorted(slugified_key_list))

            # test column labels
            label_list.append(unslug_name)
            labels = [schema[col][LABEL] for col in schema.keys()]
            self.assertEqual(sorted(labels), sorted(label_list))

            # test result of calculation
            formula = column_labels_to_slugs[formula]

            # use a distinct index name so the outer enumerate index `idx`
            # is not shadowed (the original reused `idx` here)
            for row_idx, row in dframe.iterrows():
                try:
                    result = np.float64(row[name])
                    stored = np.float64(row[formula])
                    # np.nan != np.nan, continue if we have two nan values
                    if np.isnan(result) and np.isnan(stored):
                        continue
                    msg = self._equal_msg(result, stored, formula)
                    self.assertAlmostEqual(result, stored, self.places, msg)
                except ValueError:
                    # non-numeric columns: fall back to exact equality
                    msg = self._equal_msg(row[name], row[formula], formula)
                    self.assertEqual(row[name], row[formula], msg)