def __init__(self, iterable: typing.Iterable = (), metadata: typing.Dict[str, typing.Any] = None, *, generate_metadata: bool = False, check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None) -> None:
    """
    Build a metadata-carrying list from ``iterable``.

    Parameters
    ----------
    iterable:
        Values to populate the list with. A ``pandas.DataFrame`` is expanded
        row by row; a ``numpy.matrix`` is first converted to a plain array.
    metadata:
        Optional top-level metadata, merged in with the empty selector ``()``.
    generate_metadata:
        Whether to (re)generate metadata from the constructed value.
    check, source, timestamp:
        Accepted for interface compatibility; not used in this body.
    """
    if isinstance(iterable, pandas.DataFrame):
        # Wrap each row into an instance of this class so nested values carry
        # metadata as well.
        super().__init__(type(self)(row) for row in iterable.itertuples(index=False, name=None))
    else:
        if isinstance(iterable, numpy.matrix):
            # One cannot iterate over a matrix segment by segment. You always get back
            # a matrix (2D structure) and not an array of rows or columns. By converting
            # it to an array such iteration segment by segment works.
            iterable = numpy.array(iterable)

        super().__init__(iterable)

    # Importing here to prevent import cycle.
    from d3m import types

    if isinstance(iterable, types.Container):
        if isinstance(iterable, List):
            # We made a copy, so we do not have to generate metadata.
            self.metadata: metadata_base.DataMetadata = iterable.metadata
        else:
            # Another container type: reuse its metadata, optionally
            # regenerating it against this new value.
            self.metadata: metadata_base.DataMetadata = iterable.metadata

            if generate_metadata:
                self.metadata = self.metadata.generate(self)

        if metadata is not None:
            # Caller-supplied metadata overrides/extends the top level.
            self.metadata: metadata_base.DataMetadata = self.metadata.update((), metadata)
    else:
        # Plain iterable: start from caller-supplied metadata (possibly None).
        self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata(metadata)

        if generate_metadata:
            self.metadata = self.metadata.generate(self)
def _update_metadata(
    cls,
    metadata: metadata_base.DataMetadata,
    resource_id: metadata_base.SelectorSegment,
) -> metadata_base.DataMetadata:
    """
    Lift the metadata of one Dataset resource into standalone container metadata.

    Raises
    ------
    TypeError
        If the selected resource is not a DataFrame.
    """
    resource_metadata = dict(metadata.query((resource_id,)))

    if "structural_type" not in resource_metadata or not issubclass(
            resource_metadata["structural_type"], container.DataFrame):
        raise TypeError(
            'The Dataset resource is not a DataFrame, but "{type}".'.format(
                type=resource_metadata.get("structural_type", None),
            ),
        )

    # The promoted value becomes a top-level container, so it carries the
    # container schema version itself.
    resource_metadata["schema"] = metadata_base.CONTAINER_SCHEMA_VERSION

    promoted = metadata_base.DataMetadata(resource_metadata)
    promoted = metadata.copy_to(promoted, (resource_id,))

    # Resource is not anymore an entry point.
    return promoted.remove_semantic_type(
        (), "https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint")
def _update_metadata(
        self, metadata: metadata_base.DataMetadata, resource_id: metadata_base.SelectorSegment
) -> metadata_base.DataMetadata:
    """
    Extract the metadata of one Dataset resource as standalone DataFrame metadata.

    Raises
    ------
    TypeError
        If the selected resource is not a DataFrame.
    """
    resource_metadata = dict(metadata.query((resource_id, )))

    # The resource must be tabular: anything else cannot be promoted here.
    if 'structural_type' not in resource_metadata or not issubclass(
            resource_metadata['structural_type'], container.DataFrame):
        raise TypeError(
            "The Dataset resource is not a DataFrame, but \"{type}\".".
            format(type=resource_metadata.get('structural_type', None), ))

    # The promoted value becomes a top-level container, so it carries the
    # container schema version itself.
    resource_metadata.update(
        {
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
        }, )

    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

    # Resource is not anymore an entry point.
    new_metadata = new_metadata.remove_semantic_type(
        (), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

    return new_metadata
def __finalize__(self: D, other: typing.Any, method: str = None, **kwargs: typing.Any) -> D:
    """Propagate the "metadata" attribute when pandas derives a new frame from an old one."""
    self = super().__finalize__(other, method, **kwargs)

    # Choose which object the metadata should be carried over from:
    # the left frame of a merge, the first frame of a concat, or
    # "other" itself for every remaining operation.
    if method == 'merge':
        source = other.left
    elif method == 'concat':
        source = other.objs[0]
    else:
        source = other

    if isinstance(source, DataFrame):
        # TODO: We could adapt (if this is after a slice) metadata instead of just copying?
        self.metadata: metadata_base.DataMetadata = source.metadata
    elif not hasattr(self, 'metadata'):
        # "metadata" attribute should already be set in "__init__",
        # but if we got here without it, set an empty one now.
        self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata()

    return self
def __new__( cls: typing.Type[N], input_array: typing.Sequence, metadata: typing.Dict[str, typing.Any] = None, *, dtype: typing.Union[numpy.dtype, str] = None, order: typing.Any = None, generate_metadata: bool = False, check: bool = True, source: typing.Any = None, timestamp: datetime.datetime = None, ) -> N: array = numpy.asarray(input_array, dtype=dtype, order=order).view(cls) # Importing here to prevent import cycle. from d3m import types if isinstance(input_array, types.Container): if isinstance(input_array, ndarray): # We made a copy, so we do not have to generate metadata. array.metadata = input_array.metadata else: array.metadata = input_array.metadata if generate_metadata: array.metadata = array.metadata.generate(array) if metadata is not None: array.metadata = array.metadata.update((), metadata) else: array.metadata = metadata_base.DataMetadata(metadata) if generate_metadata: array.metadata = array.metadata.generate(array) return array
def _produce(self, inputs: DatasetSplitInputs, is_train: bool) -> base.CallResult[DatasetSplitOutputs]:
    """
    This function splits the fitted Dataset.

    Parameters
    ----------
    inputs:
        A list of 0-based indices which specify which splits to be used as test split in output.
    is_train:
        Whether we are producing train or test data.

    Returns
    -------
    Returns a list of Datasets.

    Raises
    ------
    exceptions.PrimitiveNotFittedError
        If any of the fitted state set up before producing is missing.
    """
    # All fitted state must be present before we can sample any split.
    if not self._fitted or self._splits is None or self._dataset is None or self._main_resource_id is None or self._graph is None:
        raise exceptions.PrimitiveNotFittedError("Primitive not fitted.")

    output_datasets = container.List(generate_metadata=True)

    for index in inputs:
        # Each precomputed split is a (train indices, test indices) pair.
        train_indices, test_indices = self._splits[index]

        if is_train:
            output_dataset = base_utils.sample_rows(
                self._dataset,
                self._main_resource_id,
                set(train_indices),
                self._graph,
                delete_recursive=self.hyperparams.get('delete_recursive', False),
            )
        else:
            output_dataset = base_utils.sample_rows(
                self._dataset,
                self._main_resource_id,
                set(test_indices),
                self._graph,
                delete_recursive=self.hyperparams.get('delete_recursive', False),
            )

        output_datasets.append(output_dataset)

    # Replace the list's metadata with a fresh top-level description of the
    # output list itself.
    output_datasets.metadata = metadata_base.DataMetadata({
        'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
        'structural_type': container.List,
        'dimension': {
            'length': len(output_datasets),
        },
    })

    # We update metadata based on metadata of each dataset.
    # TODO: In the future this might be done automatically by generate_metadata.
    # See: https://gitlab.com/datadrivendiscovery/d3m/issues/119
    for index, dataset in enumerate(output_datasets):
        output_datasets.metadata = dataset.metadata.copy_to(output_datasets.metadata, (), (index,))

    return base.CallResult(output_datasets)
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                 target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
    """
    Build metadata for prediction outputs and overlay per-column target metadata.

    Parameters
    ----------
    inputs_metadata:
        Metadata of the inputs; accepted for interface compatibility, not used here.
    outputs:
        The produced predictions, or None when only column metadata is available.
    target_columns_metadata:
        Per-column metadata to apply, in column order.

    Returns
    -------
    The combined outputs metadata.
    """
    outputs_metadata = metadata_base.DataMetadata()

    # "outputs" is Optional: only derive value-based metadata when a value
    # exists, instead of calling generate() on None.
    if outputs is not None:
        outputs_metadata = outputs_metadata.generate(value=outputs)

    for column_index, column_metadata in enumerate(target_columns_metadata):
        # Copy before popping so the caller's metadata dicts are not mutated.
        column_metadata = OrderedDict(column_metadata)
        # Drop "structural_type" — presumably so the value-derived structural
        # type from generate() is kept (original behavior popped it in place).
        column_metadata.pop("structural_type", None)
        outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

    return outputs_metadata
def _update_predictions_metadata(self, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
    """Build metadata for prediction outputs, then overlay per-column target metadata."""
    predictions_metadata = metadata_base.DataMetadata()

    # Only derive metadata from the value when one was actually produced.
    if outputs is not None:
        predictions_metadata = predictions_metadata.generate(outputs)

    for index, column in enumerate(target_columns_metadata):
        predictions_metadata = predictions_metadata.update_column(index, column)

    return predictions_metadata
def __array_finalize__(self, obj: typing.Any) -> None:
    """numpy hook: ensure every new view/copy ends up with a "metadata" attribute."""
    # If metadata attribute already exists, there is nothing to do.
    if hasattr(self, 'metadata'):
        return

    source_has_metadata = obj is not None and isinstance(obj, ndarray) and hasattr(obj, 'metadata')

    if source_has_metadata:
        # TODO: We could adapt (if this is after a slice) metadata instead of just copying?
        self.metadata: metadata_base.DataMetadata = obj.metadata
    else:
        self.metadata = metadata_base.DataMetadata()
def get_dataframe(dataset: container.Dataset, resource_id: str) -> container.DataFrame:
    """Extract a dataframe from a dataset, carrying the resource's metadata over."""
    # Grab the resource and its metadata out of the dataset.
    tabular_resource_id, extracted = base_utils.get_tabular_resource(dataset, resource_id)
    resource_level_metadata = dict(dataset.metadata.query((tabular_resource_id,)))

    # Copy the resource metadata from the dataset into the resource and drop
    # the entry-point marker, since the value is now standalone.
    metadata = metadata_base.DataMetadata(resource_level_metadata)
    metadata = dataset.metadata.copy_to(metadata, (resource_id,))
    metadata = metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

    extracted.metadata = metadata
    return extracted
def __init__(self, data: Data = None, metadata: typing.Dict[str, typing.Any] = None, index: typing.Union[pandas.Index, Data] = None,
             columns: typing.Union[pandas.Index, Data] = None, dtype: typing.Union[numpy.dtype, str, pandas_common.ExtensionDtype] = None,
             copy: bool = False, *, generate_metadata: bool = False, check: bool = True, source: typing.Any = None,
             timestamp: datetime.datetime = None) -> None:
    """
    Build a metadata-carrying DataFrame from ``data``.

    Parameters
    ----------
    data:
        The data to wrap; nested lists/ndarrays are converted before the pandas
        constructor is called.
    metadata:
        Optional top-level metadata, merged in with the empty selector ``()``.
    index, columns, dtype, copy:
        Passed through to the pandas DataFrame constructor.
    generate_metadata:
        Whether to (re)generate metadata from the constructed frame.
    check, source, timestamp:
        Accepted for interface compatibility; not used in this body.
    """
    # If not a constructor call to this exact class, then a child constructor
    # is responsible to call a pandas constructor.
    if type(self) is DataFrame:
        pandas.DataFrame.__init__(self, data=convert_ndarray(
            convert_lists(data)), index=index, columns=columns, dtype=dtype, copy=copy)

    # Importing here to prevent import cycle.
    from d3m import types

    if isinstance(data, types.Container):  # type: ignore
        if isinstance(data, DataFrame):
            # We made a copy, so we do not have to generate metadata.
            self.metadata: metadata_base.DataMetadata = data.metadata
        else:
            # Another container type: reuse its metadata, optionally
            # regenerating it against this new frame.
            self.metadata: metadata_base.DataMetadata = data.metadata

            if generate_metadata:
                self.metadata = self.metadata.generate(self)

        if metadata is not None:
            # Caller-supplied metadata overrides/extends the top level.
            self.metadata: metadata_base.DataMetadata = self.metadata.update(
                (), metadata)
    else:
        # Plain data: start from caller-supplied metadata (possibly None).
        self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata(
            metadata)

        if generate_metadata:
            self.metadata = self.metadata.generate(self)
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                 target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
    """
    Update metadata for selected columns.

    Args:
        inputs_metadata: metadata_base.DataMetadata; accepted for interface compatibility, not used here.
        outputs: Container Dataframe, or None when only column metadata is available.
        target_columns_metadata: list of per-column metadata to apply, in column order.

    Returns:
        d3m.metadata.base.DataMetadata
    """
    outputs_metadata = metadata_base.DataMetadata()

    # "outputs" is Optional: only derive value-based metadata when a value
    # exists, instead of calling generate() on None.
    if outputs is not None:
        outputs_metadata = outputs_metadata.generate(value=outputs)

    for column_index, column_metadata in enumerate(target_columns_metadata):
        # Copy before popping so the caller's metadata dicts are not mutated.
        column_metadata = OrderedDict(column_metadata)
        # Drop "structural_type" — presumably so the value-derived structural
        # type from generate() is kept (original behavior popped it in place).
        column_metadata.pop("structural_type", None)
        outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

    return outputs_metadata
def get_dataframe(dataset: container.Dataset, resource_id: str, target_col: int) -> container.DataFrame:
    """
    Extract a dataframe from a dataset, carrying the resource's metadata over
    and marking ``target_col`` as the true target column.
    """
    # Grab the resource and its metadata out of the dataset.
    tabular_resource_id, extracted = base_utils.get_tabular_resource(dataset, resource_id)
    resource_level_metadata = dict(dataset.metadata.query((tabular_resource_id,)))

    # Copy the resource metadata from the dataset into the resource and drop
    # the entry-point marker, since the value is now standalone.
    # NOTE(review): the query above uses the resolved resource id while
    # copy_to uses the caller-supplied one — confirm they always coincide.
    metadata = metadata_base.DataMetadata(resource_level_metadata)
    metadata = dataset.metadata.copy_to(metadata, (resource_id,))
    metadata = metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

    # Add target metadata to the specified column.
    metadata = metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, target_col),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget'
    )

    extracted.metadata = metadata
    return extracted
def get_dataset(input_data, target_index=-2, index_column=-1, semantic_types=None, parse=False, media_dir=None):
    """
    A function that has as input a dataframe, and generates a D3M dataset.

    Parameters
    ----------
    input_data : pd.DataFrame
        The dataframe to be converted to d3m Dataset.
    target_index : int
        The index of the target, if index is not present, it will be ignored.
    index_column : int
        The index of the index target, if not provided it will look for d3m index, if not generate one.
    semantic_types : Sequence[Sequence[str]]
        A list of semantic types to be applied. The sequence must be of the same length of
        the dataframe columns.
    parse : bool
        A flag to determine if the dataset will contain parsed columns. By default is set to false
        to make it compatible with most of D3M current infrastructure.
    media_dir : str
        The absolute path of the directory containing the image/video/csv files, if not present,
        it will be ignored.

    Returns
    -------
    A D3M dataset.
    """
    data = make_unique_columns(input_data.copy(deep=True))

    # No semantic types supplied: infer Float/Integer per column from dtypes,
    # leaving other columns with an empty list.
    if semantic_types is None:
        semantic_types = [[] for i in range(len(data.columns))]
        for i, _type in enumerate(input_data.dtypes):
            if _type == float:
                semantic_types[i].append('http://schema.org/Float')
            elif _type == int:
                semantic_types[i].append('http://schema.org/Integer')

    resources = {}

    # Locate the primary index column, or synthesize a "d3mIndex" one at
    # position 0 (shifting the target index accordingly).
    if 'd3mIndex' in data.columns:
        index_column = list(data.columns).index("d3mIndex")
    else:
        if index_column == -1:
            data.insert(0, 'd3mIndex', range(len(data)))
            semantic_types.insert(0, [])
            target_index += 1
            index_column = 0

    data = container_pandas.DataFrame(data)

    # TODO(review): original note said "remove this" — when not parsing,
    # every value is stringified.
    if not parse:
        data = data.astype(str)
    metadata = metadata_base.DataMetadata()

    resources['learningData'] = data

    # Top-level metadata for the single "learningData" resource (the entry point).
    metadata = metadata.update(('learningData',), {
        'structural_type': type(data),
        'semantic_types': [
            'https://metadata.datadrivendiscovery.org/types/Table',
            'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint',
        ],
        'dimension': {
            'name': 'rows',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
            'length': len(data),
        },
    })

    metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), {
        'dimension': {
            'name': 'columns',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
            'length': len(data.columns),
        },
    })

    # Per-column metadata: primary key, target/attribute roles, and optional
    # media-file annotations.
    for i, column_name in enumerate(data.columns):
        if i == index_column:
            metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                'name': column_name,
                'structural_type': numpy.int64,
                'semantic_types': [
                    'http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
                ],
            })
        else:
            _structural_type = str
            if semantic_types[i]:
                _semantic_types = semantic_types[i]
                if 'http://schema.org/Float' in _semantic_types:
                    _structural_type = numpy.float64
                elif 'http://schema.org/Integer' in _semantic_types:
                    _structural_type = numpy.int64
            else:
                _semantic_types = ['https://metadata.datadrivendiscovery.org/types/UnknownType']

            # Unparsed datasets keep string values, so the structural type is str.
            if not parse:
                _structural_type = str

            if i == target_index:
                _semantic_types += ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget']
            else:
                _semantic_types += ['https://metadata.datadrivendiscovery.org/types/Attribute']

            # Add media dir if any
            if media_dir is not None and i != target_index:
                # Check the type of the first path; the media type is guessed
                # from its file extension.
                first_file_path = data.iloc[0, i]
                suffix = first_file_path.split('.')[-1]
                if suffix in ['png', 'jpg']:
                    media_type = 'image'
                elif suffix in ['mp4', 'avi']:
                    media_type = 'video'
                else:
                    media_type = 'text'
                _semantic_types += ["https://metadata.datadrivendiscovery.org/types/FileName"]
                metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                    'name': column_name,
                    'structural_type': str,
                    'semantic_types': _semantic_types,
                    "location_base_uris": [pathlib.Path(media_dir).as_uri()+'/'],
                    "media_types": [
                        media_type+"/"+suffix
                    ],
                })
            else:
                metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                    'name': column_name,
                    'structural_type': _structural_type,
                    'semantic_types': _semantic_types,
                })

    # Dataset-level metadata; a random UUID serves as both id and name.
    dataset_id = str(uuid.uuid4())
    dataset_metadata = {
        'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
        'structural_type': Dataset,
        'id': dataset_id,
        'name': dataset_id,
        'digest': str(uuid.uuid4()),
        'dimension': {
            'name': 'resources',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
            'length': len(resources),
        },
    }

    metadata = metadata.update((), dataset_metadata)

    dataset = Dataset(resources, metadata)
    return dataset
def test_update_with_generated_metadata(self):
    """Exercise DataMetadata._update_with_generated_metadata: plain cell merging, then deep-merging of semantic types and dimensions."""
    metadata = base.DataMetadata({
        'schema': base.CONTAINER_SCHEMA_VERSION,
        'structural_type': container.ndarray,
    })

    # Generated metadata for individual cells, including ALL_ELEMENTS wildcards
    # at two levels.
    cells_metadata = collections.OrderedDict()
    cells_metadata[('a',)] = {'other': 1}
    cells_metadata[('b',)] = {'other': 2}
    cells_metadata[('c',)] = {'other': 3}
    cells_metadata[(base.ALL_ELEMENTS,)] = {'foo': 'bar'}
    cells_metadata[('other', 'a')] = {'other': 4}
    cells_metadata[('other', 'b')] = {'other': 5}
    cells_metadata[('other', 'c')] = {'other': 6}
    cells_metadata[('other', base.ALL_ELEMENTS)] = {'foo': 'bar2'}

    metadata._update_with_generated_metadata(cells_metadata)

    # Every cell entry should appear under its selector, wildcards included.
    self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.numpy.ndarray',
        },
    }, {
        'selector': ['__ALL_ELEMENTS__'],
        'metadata': {'foo': 'bar'},
    }, {
        'selector': ['a'],
        'metadata': {'other': 1},
    }, {
        'selector': ['b'],
        'metadata': {'other': 2},
    }, {
        'selector': ['c'],
        'metadata': {'other': 3},
    }, {
        'selector': ['other', '__ALL_ELEMENTS__'],
        'metadata': {'foo': 'bar2'},
    }, {
        'selector': ['other', 'a'],
        'metadata': {'other': 4},
    }, {
        'selector': ['other', 'b'],
        'metadata': {'other': 5},
    }, {
        'selector': ['other', 'c'],
        'metadata': {'other': 6},
    }])

    # Second scenario: existing metadata already has semantic types and
    # dimensions which the generated metadata must be merged into.
    metadata = base.DataMetadata({
        'schema': base.CONTAINER_SCHEMA_VERSION,
        'structural_type': container.ndarray,
        'semantic_types': ['http://example.com/Type1'],
        'dimension': {
            'length': 0,
            'foobar': 42,
            'semantic_types': ['http://example.com/Type2'],
        }
    })

    metadata = metadata.update(('a',), {
        'semantic_types': ['http://example.com/Type3'],
        'dimension': {
            'length': 0,
            'foobar': 45,
            'semantic_types': ['http://example.com/Type4'],
        }
    })

    cells_metadata = collections.OrderedDict()
    cells_metadata[()] = {
        'other': 1,
        'structural_type': container.ndarray,
        'semantic_types': ['http://example.com/Type1a'],
        'dimension': {
            'length': 100,
            'name': 'test1',
            'semantic_types': ['http://example.com/Type2a'],
        }
    }
    cells_metadata[('a',)] = {
        'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'],
        'dimension': {
            'length': 200,
            'name': 'test2',
            'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'],
        }
    }
    cells_metadata[('b',)] = {'other': 2}

    metadata._update_with_generated_metadata(cells_metadata)

    # Semantic type lists are unioned; dimension entries are merged with
    # generated values (length, name) taking effect alongside existing ones (foobar).
    self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.numpy.ndarray',
            'other': 1,
            'semantic_types': ['http://example.com/Type1', 'http://example.com/Type1a'],
            'dimension': {
                'length': 100,
                'name': 'test1',
                'foobar': 42,
                'semantic_types': ['http://example.com/Type2', 'http://example.com/Type2a'],
            },
        },
    }, {
        'selector': ['a'],
        'metadata': {
            'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'],
            'dimension': {
                'length': 200,
                'name': 'test2',
                'foobar': 45,
                'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'],
            },
        },
    }, {
        'selector': ['b'],
        'metadata': {'other': 2},
    }])

    # The public JSON structure should match the internal one here.
    self.assertEqual(metadata.to_json_structure(), [{
        'selector': [],
        'metadata': {
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': 'd3m.container.numpy.ndarray',
            'other': 1,
            'semantic_types': ['http://example.com/Type1', 'http://example.com/Type1a'],
            'dimension': {
                'length': 100,
                'name': 'test1',
                'foobar': 42,
                'semantic_types': ['http://example.com/Type2', 'http://example.com/Type2a'],
            },
        },
    }, {
        'selector': ['a'],
        'metadata': {
            'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'],
            'dimension': {
                'length': 200,
                'name': 'test2',
                'foobar': 45,
                'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'],
            },
        },
    }, {
        'selector': ['b'],
        'metadata': {'other': 2},
    }])