def profiling_column(self,
                     variable_metadata: VariableMetadata,
                     column: pd.Series) -> VariableMetadata:
    """Fill in missing metadata fields of a single column by profiling its data.

    Fields that already carry a value are left untouched; only the gaps are
    filled, using ``self.profiler``.

    Args:
        variable_metadata: the original VariableMetadata instance. It is
            updated in place.
        column: the column to profile.

    Returns:
        The same VariableMetadata instance, with missing fields profiled.
    """
    if not variable_metadata.name:
        variable_metadata.name = column.name

    if not variable_metadata.description:
        variable_metadata.description = \
            self.profiler.construct_variable_description(column)

    if variable_metadata.named_entity is None:
        variable_metadata.named_entity = \
            self.profiler.profile_named_entity(column)

    coverage = variable_metadata.temporal_coverage
    # Profile only when a coverage is present and at least one of its bounds
    # is still missing. A coverage with both 'start' and 'end' set is complete
    # and must not be overwritten.
    if coverage and (not coverage['start'] or not coverage['end']):
        variable_metadata.temporal_coverage = \
            self.profiler.profile_temporal_coverage(coverage, column)

    return variable_metadata
def setUp(self):
    """Prepare two variable descriptions and their metadata objects.

    Each sample description is deep-copied before use, and a
    VariableMetadata is built from each copy (datamart ids 0 and 10).
    """
    first_description = copy.deepcopy(sample_variable_1)
    second_description = copy.deepcopy(sample_variable_2)

    self.variable_1 = first_description
    self.variable_2 = second_description

    self.metadata_1 = VariableMetadata(description=first_description,
                                       datamart_id=0)
    self.metadata_2 = VariableMetadata(description=second_description,
                                       datamart_id=10)
def construct_variable_metadata(
        self,
        description: dict,
        global_datamart_id: int,
        col_offset: int,
        data: pd.DataFrame = None) -> VariableMetadata:
    """Build the metadata of one variable, profiling it when data is given.

    The variable's datamart id is ``col_offset + global_datamart_id + 1``.

    Args:
        description: description dict.
        global_datamart_id: integer of datamart id.
        col_offset: integer, the column index.
        data: dataframe of data. When it is None no profiling is done.

    Returns:
        VariableMetadata instance
    """
    variable_id = col_offset + global_datamart_id + 1
    built = VariableMetadata.construct_variable(description,
                                                datamart_id=variable_id)

    # Without data there is nothing to profile.
    if data is None:
        return built

    return self._profiling_column(description, built,
                                  data.iloc[:, col_offset])
def test_add_variable(self):
    """Add every variable description to the metadata and check the result.

    The metadata must start with no variables, end with one variable per
    sample description, and its value must equal the ground truth.
    """
    self.assertEqual(len(self.metadata.variables), 0)

    descriptions = self.global_metadata_description["variables"]
    # Datamart ids of the variables start at 1.
    for datamart_id, variable_description in enumerate(descriptions, start=1):
        self.metadata.add_variable_metadata(
            VariableMetadata(variable_description, datamart_id=datamart_id))

    expected_count = len(sample_global_metadata_description["variables"])
    self.assertEqual(len(self.metadata.variables), expected_count)
    self.assertEqual(self.metadata.value, gt["metadata"])
def test_add_variable(self):
    """Add every variable description to the metadata and check the result.

    Prints a banner with the test class name before running and a colored
    marker once all assertions have passed.
    """
    banner = "[Test]{}/test_add_variable".format(self.__class__.__name__)
    print(banner)

    self.assertEqual(len(self.metadata.variables), 0)

    descriptions = self.global_metadata_description["variables"]
    # Datamart ids of the variables start at 1.
    for datamart_id, variable_description in enumerate(descriptions, start=1):
        self.metadata.add_variable_metadata(
            VariableMetadata(variable_description, datamart_id=datamart_id))

    expected_count = len(sample_global_metadata_description["variables"])
    self.assertEqual(len(self.metadata.variables), expected_count)
    self.assertEqual(self.metadata.value, gt["metadata"])

    print(colored('.Done', 'red'))
def basic_profiling_column(cls,
                           description: dict,
                           variable_metadata: VariableMetadata,
                           column: pd.Series) -> VariableMetadata:
    """Fill in the basic metadata fields of one column that are still missing.

    Args:
        description: description dict about the column. When it is empty,
            fields explicitly set to False may still be detected from data.
        variable_metadata: the original VariableMetadata instance. It is
            updated in place.
        column: the column to profile.

    Returns:
        The same VariableMetadata instance, with missing fields profiled.
    """
    meta = variable_metadata

    if not meta.name:
        meta.name = str(column.name)

    if not meta.description:
        meta.description = cls.construct_variable_description(column)

    # Named entity: None means "not decided yet"; False is only revisited
    # when no description was supplied and the column is recognized as one.
    entity = meta.named_entity
    if entity is None:
        meta.named_entity = cls.profile_named_entity(column)
    elif (entity is False and not description
          and cls.named_entity_column_recognize(column)):
        meta.named_entity = cls.profile_named_entity(column)

    # Temporal coverage: an existing coverage gets its missing bound(s)
    # profiled; False is only revisited when no description was supplied.
    coverage = meta.temporal_coverage
    if coverage is False:
        if not description:
            detected = cls.profile_temporal_coverage(column=column)
            if detected:
                meta.temporal_coverage = detected
    elif not (coverage['start'] and coverage['end']):
        meta.temporal_coverage = cls.profile_temporal_coverage(
            column=column, coverage=coverage)

    if not meta.semantic_type:
        meta.semantic_type = cls.profile_semantic_type(column)

    return meta