Example #1
    def __init__(self, iterable: typing.Iterable = (), metadata: typing.Dict[str, typing.Any] = None, *,
                 generate_metadata: bool = False, check: bool = True, source: typing.Any = None,
                 timestamp: datetime.datetime = None) -> None:
        if isinstance(iterable, pandas.DataFrame):
            super().__init__(type(self)(row) for row in iterable.itertuples(index=False, name=None))
        else:
            if isinstance(iterable, numpy.matrix):
                # Iterating over a numpy.matrix always yields 2D matrix slices
                # rather than 1D rows or columns, so convert it to a plain array
                # first so that element-wise iteration works.
                iterable = numpy.array(iterable)
            super().__init__(iterable)

        from d3m import types

        if isinstance(iterable, types.Container):
            if isinstance(iterable, List):
                # We made a copy, so we do not have to generate metadata.
                self.metadata: metadata_base.DataMetadata = iterable.metadata
            else:
                self.metadata: metadata_base.DataMetadata = iterable.metadata
                if generate_metadata:
                    self.metadata = self.metadata.generate(self)

            if metadata is not None:
                self.metadata: metadata_base.DataMetadata = self.metadata.update((), metadata)
        else:
            self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata(metadata)
            if generate_metadata:
                self.metadata = self.metadata.generate(self)
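
A minimal usage sketch for the constructor above (assuming the d3m package is installed; the values are illustrative):

from d3m import container

# Construct a typed list and let the constructor generate metadata for it.
values = container.List([1, 2, 3], generate_metadata=True)
print(values.metadata.query(()))  # top-level metadata, e.g. structural_type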
Example #2
    @classmethod
    def _update_metadata(
        cls,
        metadata: metadata_base.DataMetadata,
        resource_id: metadata_base.SelectorSegment,
    ) -> metadata_base.DataMetadata:
        resource_metadata = dict(metadata.query((resource_id, )))

        if "structural_type" not in resource_metadata or not issubclass(
                resource_metadata["structural_type"], container.DataFrame):
            raise TypeError(
                'The Dataset resource is not a DataFrame, but "{type}".'.
                format(type=resource_metadata.get("structural_type", None), ))

        resource_metadata.update(
            {
                "schema": metadata_base.CONTAINER_SCHEMA_VERSION,
            }, )

        new_metadata = metadata_base.DataMetadata(resource_metadata)

        new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

        # Resource is not anymore an entry point.
        new_metadata = new_metadata.remove_semantic_type(
            (),
            "https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint")

        return new_metadata
Example #3
    def _update_metadata(
        self, metadata: metadata_base.DataMetadata,
        resource_id: metadata_base.SelectorSegment
    ) -> metadata_base.DataMetadata:
        resource_metadata = dict(metadata.query((resource_id, )))

        if 'structural_type' not in resource_metadata or not issubclass(
                resource_metadata['structural_type'], container.DataFrame):
            raise TypeError(
                "The Dataset resource is not a DataFrame, but \"{type}\".".
                format(type=resource_metadata.get('structural_type', None), ))

        resource_metadata.update(
            {
                'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            }, )

        new_metadata = metadata_base.DataMetadata(resource_metadata)

        new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

        # Resource is not anymore an entry point.
        new_metadata = new_metadata.remove_semantic_type(
            (),
            'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

        return new_metadata
Example #4
    def __finalize__(self: D,
                     other: typing.Any,
                     method: str = None,
                     **kwargs: typing.Any) -> D:
        self = super().__finalize__(other, method, **kwargs)

        # Merge operation: using metadata of the left object.
        if method == 'merge':
            obj = other.left
        # Concat operation: using metadata of the first object.
        elif method == 'concat':
            obj = other.objs[0]
        else:
            obj = other

        if isinstance(obj, DataFrame):
            # TODO: We could adapt (if this is after a slice) metadata instead of just copying?
            self.metadata: metadata_base.DataMetadata = obj.metadata
        # "metadata" attribute should already be set in "__init__",
        # but if we got here without it, let's set it now.
        elif not hasattr(self, 'metadata'):
            self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata()

        return self
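
A short sketch of the concat handling above (assuming d3m container DataFrames; recent pandas versions route concatenation through "__finalize__" with method='concat', so the result keeps the metadata of the first object):

import pandas
from d3m import container

df_a = container.DataFrame(pandas.DataFrame({'x': [1]}), generate_metadata=True)
df_b = container.DataFrame(pandas.DataFrame({'x': [2]}), generate_metadata=True)

combined = pandas.concat([df_a, df_b])
assert combined.metadata is df_a.metadata  # metadata of the first object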
Example #5
    def __new__(
        cls: typing.Type[N],
        input_array: typing.Sequence,
        metadata: typing.Dict[str, typing.Any] = None,
        *,
        dtype: typing.Union[numpy.dtype, str] = None,
        order: typing.Any = None,
        generate_metadata: bool = False,
        check: bool = True,
        source: typing.Any = None,
        timestamp: datetime.datetime = None,
    ) -> N:
        array = numpy.asarray(input_array, dtype=dtype, order=order).view(cls)

        # Importing here to prevent import cycle.
        from d3m import types

        if isinstance(input_array, types.Container):
            if isinstance(input_array, ndarray):
                # We made a copy, so we do not have to generate metadata.
                array.metadata = input_array.metadata
            else:
                array.metadata = input_array.metadata
                if generate_metadata:
                    array.metadata = array.metadata.generate(array)

            if metadata is not None:
                array.metadata = array.metadata.update((), metadata)
        else:
            array.metadata = metadata_base.DataMetadata(metadata)
            if generate_metadata:
                array.metadata = array.metadata.generate(array)

        return array
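
A minimal usage sketch for "__new__" above (assuming the d3m package is installed; the values are illustrative):

import numpy
from d3m import container

# Constructing the container ndarray directly runs "__new__" above.
array = container.ndarray(numpy.arange(6).reshape(2, 3), generate_metadata=True)
print(array.metadata.query(()))  # includes structural_type and dimension info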
Example #6
    def _produce(self, inputs: DatasetSplitInputs, is_train: bool) -> base.CallResult[DatasetSplitOutputs]:
        """
        This function splits the fitted Dataset.

        Parameters
        ----------
        inputs:
            A list of 0-based indices specifying which of the fitted splits to produce.
        is_train:
            Whether we are producing train or test data.

        Returns
        -------
        A list of Datasets, one per requested split index.
        """

        if not self._fitted or self._splits is None or self._dataset is None or self._main_resource_id is None or self._graph is None:
            raise exceptions.PrimitiveNotFittedError("Primitive not fitted.")

        output_datasets = container.List(generate_metadata=True)

        for index in inputs:
            train_indices, test_indices = self._splits[index]
            indices = train_indices if is_train else test_indices

            output_dataset = base_utils.sample_rows(
                self._dataset,
                self._main_resource_id,
                set(indices),
                self._graph,
                delete_recursive=self.hyperparams.get('delete_recursive', False),
            )

            output_datasets.append(output_dataset)

        output_datasets.metadata = metadata_base.DataMetadata({
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
            'structural_type': container.List,
            'dimension': {
                'length': len(output_datasets),
            },
        })

        # We update metadata based on metadata of each dataset.
        # TODO: In the future this might be done automatically by generate_metadata.
        #       See: https://gitlab.com/datadrivendiscovery/d3m/issues/119
        for index, dataset in enumerate(output_datasets):
            output_datasets.metadata = dataset.metadata.copy_to(output_datasets.metadata, (), (index,))

        return base.CallResult(output_datasets)
Example #7
    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata
Example #8
    def _update_predictions_metadata(self, outputs: Optional[Outputs], target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata()
        if outputs is not None:
            outputs_metadata = outputs_metadata.generate(outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata
Example #9
    def __array_finalize__(self, obj: typing.Any) -> None:
        # If metadata attribute already exists.
        if hasattr(self, 'metadata'):
            return

        if obj is not None and isinstance(obj, ndarray) and hasattr(obj, 'metadata'):
            # TODO: We could adapt (if this is after a slice) metadata instead of just copying?
            self.metadata: metadata_base.DataMetadata = obj.metadata
        else:
            self.metadata = metadata_base.DataMetadata()
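
Because "__array_finalize__" copies the "metadata" attribute, views and slices keep a reference to the source array's metadata. A small sketch, assuming the d3m package is installed:

import numpy
from d3m import container

array = container.ndarray(numpy.arange(6).reshape(2, 3), generate_metadata=True)
view = array[0]  # slicing triggers "__array_finalize__"
assert view.metadata is array.metadata  # copied by reference, not regenerated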
Example #10
def get_dataframe(dataset: container.Dataset, resource_id: str) -> container.DataFrame:
    # extracts a dataframe from a dataset and ensures its metadata is transferred over

    # grab the resource and its metadata out of the dataset
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(dataset, resource_id)
    resource_metadata = dict(dataset.metadata.query((dataframe_resource_id,)))
    # copy the resource metadata from the dataset into the resource
    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = dataset.metadata.copy_to(new_metadata, (resource_id,))
    new_metadata = new_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')
    dataframe.metadata = new_metadata

    return dataframe
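
Hypothetical usage of the helper above; the dataset URI and the resource id 'learningData' are illustrative:

from d3m import container

dataset = container.Dataset.load('file:///path/to/datasetDoc.json')
dataframe = get_dataframe(dataset, 'learningData')
print(dataframe.metadata.query(()))  # resource metadata now sits at the top level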
Example #11
    def __init__(self,
                 data: Data = None,
                 metadata: typing.Dict[str, typing.Any] = None,
                 index: typing.Union[pandas.Index, Data] = None,
                 columns: typing.Union[pandas.Index, Data] = None,
                 dtype: typing.Union[numpy.dtype, str,
                                     pandas_common.ExtensionDtype] = None,
                 copy: bool = False,
                 *,
                 generate_metadata: bool = False,
                 check: bool = True,
                 source: typing.Any = None,
                 timestamp: datetime.datetime = None) -> None:
        # If not a constructor call to this exact class, then a child constructor
        # is responsible to call a pandas constructor.
        if type(self) is DataFrame:
            pandas.DataFrame.__init__(self,
                                      data=convert_ndarray(
                                          convert_lists(data)),
                                      index=index,
                                      columns=columns,
                                      dtype=dtype,
                                      copy=copy)

        # Importing here to prevent import cycle.
        from d3m import types

        if isinstance(data, types.Container):  # type: ignore
            if isinstance(data, DataFrame):
                # We made a copy, so we do not have to generate metadata.
                self.metadata: metadata_base.DataMetadata = data.metadata
            else:
                self.metadata: metadata_base.DataMetadata = data.metadata
                if generate_metadata:
                    self.metadata = self.metadata.generate(self)

            if metadata is not None:
                self.metadata: metadata_base.DataMetadata = self.metadata.update(
                    (), metadata)
        else:
            self.metadata: metadata_base.DataMetadata = metadata_base.DataMetadata(
                metadata)
            if generate_metadata:
                self.metadata = self.metadata.generate(self)
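
A minimal construction sketch for the DataFrame above (assuming the d3m package is installed; the column names and values are illustrative):

import pandas
from d3m import container

df = container.DataFrame(pandas.DataFrame({'a': [1, 2], 'b': ['x', 'y']}),
                         generate_metadata=True)
print(df.metadata.query_column(0))  # per-column metadata generated from the data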
Example #12
    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        """
        Update metadata for selected columns.
        Args:
            inputs_metadata: metadata_base.DataMetadata
            outputs: Container Dataframe
            target_columns_metadata: list

        Returns:
            d3m.metadata.base.DataMetadata
        """
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata
Example #13
def get_dataframe(dataset: container.Dataset, resource_id: str, target_col: int) -> container.DataFrame:
    """ extracts a dataframe from a dataset and ensures its metadata is transferred over """

    # grab the resource and its metadata out of the dataset
    dataframe_resource_id, dataframe = base_utils.get_tabular_resource(dataset, resource_id)
    resource_metadata = dict(dataset.metadata.query((dataframe_resource_id,)))

    # copy the resource metadata from the dataset into the resource
    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = dataset.metadata.copy_to(new_metadata, (resource_id,))
    new_metadata = new_metadata.remove_semantic_type((), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

    # add target metadata to specified column
    new_metadata = new_metadata.add_semantic_type(
        (metadata_base.ALL_ELEMENTS, target_col),
        'https://metadata.datadrivendiscovery.org/types/TrueTarget'
    )
    dataframe.metadata = new_metadata
    return dataframe
Example #14
def get_dataset(input_data, target_index=-2, index_column=-1, semantic_types=None, parse=False, media_dir=None):
    """
    A function that takes a dataframe as input and generates a D3M dataset.

    Parameters
    ----------
    input_data : pd.DataFrame
        The dataframe to be converted to a D3M Dataset.
    target_index : int
        The index of the target column; if no column at that index is present, it is ignored.
    index_column : int
        The index of the index column; if not provided, the function looks for a d3mIndex
        column and generates one if none is found.
    semantic_types : Sequence[Sequence[str]]
        A list of semantic types to be applied. The sequence must have the same length as
        the dataframe columns.
    parse : bool
        A flag that determines whether the dataset will contain parsed columns. By default it
        is set to False to keep it compatible with most of the current D3M infrastructure.
    media_dir : str
        The absolute path of the directory containing the image/video/csv files; if not
        provided, it is ignored.

    Returns
    -------
    A D3M dataset.
    """
    data = make_unique_columns(input_data.copy(deep=True))
    if semantic_types is None:
        semantic_types = [[] for i in range(len(data.columns))]
        for i, _type in enumerate(input_data.dtypes):
            if _type == float:
                semantic_types[i].append('http://schema.org/Float')
            elif _type == int:
                semantic_types[i].append('http://schema.org/Integer')

    resources = {}

    if 'd3mIndex' in data.columns:
        index_column = list(data.columns).index("d3mIndex")
    else:
        if index_column == -1:
            data.insert(0, 'd3mIndex', range(len(data)))
            semantic_types.insert(0, [])
            target_index += 1
            index_column = 0

    data = container_pandas.DataFrame(data)

    # TODO: remove this cast; when parsing is disabled, all values are stored as
    # strings so that downstream primitives can parse the columns themselves.
    if not parse:
        data = data.astype(str)
    metadata = metadata_base.DataMetadata()

    resources['learningData'] = data

    metadata = metadata.update(('learningData',), {
        'structural_type': type(data),
        'semantic_types': [
            'https://metadata.datadrivendiscovery.org/types/Table',
            'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint',
        ],
        'dimension': {
            'name': 'rows',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularRow'],
            'length': len(data),
        },
    })

    metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS), {
        'dimension': {
            'name': 'columns',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/TabularColumn'],
            'length': len(data.columns),
        },
    })

    for i, column_name in enumerate(data.columns):
        if i == index_column:
            metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                'name': column_name,
                'structural_type': numpy.int64,
                'semantic_types': [
                    'http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/PrimaryKey',
                ],
            })
        else:
            _structural_type = str
            if semantic_types[i]:
                _semantic_types = list(semantic_types[i])  # copy so the caller's list is not mutated below
                if 'http://schema.org/Float' in _semantic_types:
                    _structural_type = numpy.float64
                elif 'http://schema.org/Integer' in _semantic_types:
                    _structural_type = numpy.int64
            else:
                _semantic_types = ['https://metadata.datadrivendiscovery.org/types/UnknownType']

            if not parse:
                _structural_type = str
            if i == target_index:
                _semantic_types += ['https://metadata.datadrivendiscovery.org/types/SuggestedTarget']
            else:
                _semantic_types += ['https://metadata.datadrivendiscovery.org/types/Attribute']

            # Add media dir if any
            if media_dir is not None and i != target_index:
                # Check the type of the first path
                first_file_path = data.iloc[0, i]
                suffix = first_file_path.split('.')[-1]
                if suffix in ['png', 'jpg']:
                    media_type = 'image'
                elif suffix in ['mp4', 'avi']:
                    media_type = 'video'
                else:
                    media_type = 'text'

                _semantic_types += ["https://metadata.datadrivendiscovery.org/types/FileName"]
                metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                    'name': column_name,
                    'structural_type': str,
                    'semantic_types': _semantic_types,
                    "location_base_uris": [pathlib.Path(media_dir).as_uri()+'/'],
                    "media_types": [
                        media_type+"/"+suffix
                    ],
                })
            else:
                metadata = metadata.update(('learningData', metadata_base.ALL_ELEMENTS, i), {
                    'name': column_name,
                    'structural_type': _structural_type,
                    'semantic_types': _semantic_types,
                })

    dataset_id = str(uuid.uuid4())
    dataset_metadata = {
        'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
        'structural_type': Dataset,
        'id': dataset_id,
        'name': dataset_id,
        'digest': str(uuid.uuid4()),
        'dimension': {
            'name': 'resources',
            'semantic_types': ['https://metadata.datadrivendiscovery.org/types/DatasetResource'],
            'length': len(resources),
        },
    }

    metadata = metadata.update((), dataset_metadata)

    dataset = Dataset(resources, metadata)
    return dataset
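
Hypothetical usage of get_dataset, converting a small pandas DataFrame; the column names and values are illustrative, and the module's other helpers (make_unique_columns, container_pandas, Dataset) are assumed to be importable:

import pandas

raw = pandas.DataFrame({'feature': [1.0, 2.0, 3.0], 'target': [0, 1, 0]})

# target_index=1 refers to the 'target' column of the input; the function
# shifts it automatically after inserting the generated d3mIndex column.
dataset = get_dataset(raw, target_index=1)
print(dataset.metadata.query(('learningData',)))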
Example #15
    def test_update_with_generated_metadata(self):
        metadata = base.DataMetadata({
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': container.ndarray,
        })

        cells_metadata = collections.OrderedDict()
        cells_metadata[('a',)] = {'other': 1}
        cells_metadata[('b',)] = {'other': 2}
        cells_metadata[('c',)] = {'other': 3}
        cells_metadata[(base.ALL_ELEMENTS,)] = {'foo': 'bar'}
        cells_metadata[('other', 'a')] = {'other': 4}
        cells_metadata[('other', 'b')] = {'other': 5}
        cells_metadata[('other', 'c')] = {'other': 6}
        cells_metadata[('other', base.ALL_ELEMENTS)] = {'foo': 'bar2'}

        metadata._update_with_generated_metadata(cells_metadata)

        self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.numpy.ndarray',
            },
        }, {
            'selector': ['__ALL_ELEMENTS__'],
            'metadata': {'foo': 'bar'},
        }, {
            'selector': ['a'],
            'metadata': {'other': 1},
        }, {
            'selector': ['b'],
            'metadata': {'other': 2},
        }, {
            'selector': ['c'],
            'metadata': {'other': 3},
        }, {
            'selector': ['other', '__ALL_ELEMENTS__'],
            'metadata': {'foo': 'bar2'},
        }, {
            'selector': ['other', 'a'],
            'metadata': {'other': 4},
        }, {
            'selector': ['other', 'b'],
            'metadata': {'other': 5},
        }, {
            'selector': ['other', 'c'],
            'metadata': {'other': 6},
        }])

        metadata = base.DataMetadata({
            'schema': base.CONTAINER_SCHEMA_VERSION,
            'structural_type': container.ndarray,
            'semantic_types': ['http://example.com/Type1'],
            'dimension': {
                'length': 0,
                'foobar': 42,
                'semantic_types': ['http://example.com/Type2'],
            }
        })

        metadata = metadata.update(('a',), {
            'semantic_types': ['http://example.com/Type3'],
            'dimension': {
                'length': 0,
                'foobar': 45,
                'semantic_types': ['http://example.com/Type4'],
            }
        })

        cells_metadata = collections.OrderedDict()
        cells_metadata[()] = {
            'other': 1,
            'structural_type': container.ndarray,
            'semantic_types': ['http://example.com/Type1a'],
            'dimension': {
                'length': 100,
                'name': 'test1',
                'semantic_types': ['http://example.com/Type2a'],
            }
        }
        cells_metadata[('a',)] = {
            'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'],
            'dimension': {
                'length': 200,
                'name': 'test2',
                'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'],
            }
        }
        cells_metadata[('b',)] = {'other': 2}

        metadata._update_with_generated_metadata(cells_metadata)

        self.assertEqual(utils.to_json_structure(metadata.to_internal_simple_structure()), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.numpy.ndarray',
                'other': 1,
                'semantic_types': ['http://example.com/Type1', 'http://example.com/Type1a'],
                'dimension': {
                    'length': 100,
                    'name': 'test1',
                    'foobar': 42,
                    'semantic_types': ['http://example.com/Type2', 'http://example.com/Type2a'],
                },
            },
        }, {
            'selector': ['a'],
            'metadata': {
                'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'],
                'dimension': {
                    'length': 200,
                    'name': 'test2',
                    'foobar': 45,
                    'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'],
                },
            },
        }, {
            'selector': ['b'],
            'metadata': {'other': 2},
        }])

        self.assertEqual(metadata.to_json_structure(), [{
            'selector': [],
            'metadata': {
                'schema': base.CONTAINER_SCHEMA_VERSION,
                'structural_type': 'd3m.container.numpy.ndarray',
                'other': 1,
                'semantic_types': ['http://example.com/Type1', 'http://example.com/Type1a'],
                'dimension': {
                    'length': 100,
                    'name': 'test1',
                    'foobar': 42,
                    'semantic_types': ['http://example.com/Type2', 'http://example.com/Type2a'],
                },
            },
        }, {
            'selector': ['a'],
            'metadata': {
                'semantic_types': ['http://example.com/Type3', 'http://example.com/Type3a'],
                'dimension': {
                    'length': 200,
                    'name': 'test2',
                    'foobar': 45,
                    'semantic_types': ['http://example.com/Type4', 'http://example.com/Type4a'],
                },
            },
        }, {
            'selector': ['b'],
            'metadata': {'other': 2},
        }])