def test_build_schema(self):
    """Check the schema stored by ``Dataset.build_schema`` for each dataset.

    For every name in ``self.TEST_DATASETS`` a fresh ``Dataset`` is saved,
    its schema is built from the test data, and the dataset is re-fetched
    with ``Dataset.find_one``. The re-fetched dataset must:

    * expose the created-at, schema and updated-at keys in its record;
    * have schema column names that are unique, contain only ``\\w``
      characters, and are not in ``MONGO_RESERVED_KEY_STRS``;
    * carry ``SIMPLETYPE``, ``OLAP_TYPE`` and ``Dataset.LABEL`` attributes
      on every schema column;
    * map every original dataframe column to exactly one schema column
      through the label attribute.
    """
    illegal_col_regex = re.compile(r'\W')

    for dataset_name in self.TEST_DATASETS:
        dataset_id = self.test_dataset_ids[dataset_name]

        dataset = Dataset()
        dataset.save(dataset_id)
        dataset.build_schema(self.get_data(dataset_name))

        # Re-fetch so the assertions run against what was actually stored.
        dataset = Dataset.find_one(dataset_id)

        record_keys = dataset.record.keys()
        for key in (Dataset.CREATED_AT, Dataset.SCHEMA, Dataset.UPDATED_AT):
            self.assertIn(key, record_keys)

        # Labels are removed from this list as they are matched, so
        # whatever remains at the end has no schema column.
        df_columns = self.get_data(dataset_name).columns.tolist()
        seen_columns = set()

        for column_name, column_attributes in dataset.schema.items():
            # check column_name is unique
            self.assertNotIn(column_name, seen_columns)
            seen_columns.add(column_name)

            # check column name is only legal chars
            self.assertIsNone(illegal_col_regex.search(column_name))

            # check has required attributes
            self.assertIn(SIMPLETYPE, column_attributes)
            self.assertIn(OLAP_TYPE, column_attributes)
            self.assertIn(Dataset.LABEL, column_attributes)

            # check label is an original column (and is used only once:
            # a repeated label fails the membership check after removal)
            label = column_attributes[Dataset.LABEL]
            self.assertIn(label, df_columns)
            df_columns.remove(label)

            # check not reserved key
            self.assertNotIn(column_name, MONGO_RESERVED_KEY_STRS)

        # ensure all columns in df_columns have stored schema columns;
        # comparing against [] reports any leftover columns on failure
        self.assertEqual(df_columns, [])