def _add_column(
        self, main_resource_id: str, data: pandas.DataFrame,
        metadata: metadata_base.DataMetadata, column_data: pandas.DataFrame,
        column_metadata: typing.Dict
    ) -> typing.Tuple[pandas.DataFrame, metadata_base.DataMetadata]:
        """
        Append the single column in ``column_data`` to ``data`` and record
        ``column_metadata`` for it under ``main_resource_id``.

        ``data`` may be ``None``, in which case ``column_data`` becomes the
        initial DataFrame. Returns the combined DataFrame together with the
        updated metadata.
        """

        assert column_data.shape[1] == 1

        if data is None:
            data = column_data
        else:
            # Align indices first so concat joins row-wise instead of
            # producing a union of the two (possibly different) indices.
            column_data = column_data.set_index(data.index)
            data = pandas.concat([data, column_data], axis=1)

        # Register metadata for the newly appended (last) column.
        metadata = metadata.update(
            (main_resource_id, metadata_base.ALL_ELEMENTS, data.shape[1] - 1),
            column_metadata,
            source=self)

        return data, metadata
# Beispiel #2  (scraping artifact: example separator, not valid Python)
# 0
    def _copy_elements_metadata(
            cls,
            source_metadata: metadata_base.DataMetadata,
            selector_prefix: metadata_base.Selector,
            selector: metadata_base.Selector,
            target_metadata: metadata_base.DataMetadata,
            *,
            source: typing.Any = None) -> metadata_base.DataMetadata:
        """
        Recursively copy metadata for every element found in ``source_metadata``
        under ``selector_prefix + selector`` into ``target_metadata``, stored
        without the prefix (i.e. under ``selector`` alone).

        ``source`` is recorded as the origin of each update; it defaults to the
        class itself. Returns the updated target metadata.
        """

        if source is None:
            source = cls

        prefix = list(selector_prefix)
        for element in source_metadata.get_elements(prefix + list(selector)):
            child_selector = list(selector) + [element]
            child_metadata = source_metadata.query(prefix + child_selector)

            # Record this element's metadata, then descend into its children.
            target_metadata = target_metadata.update(
                child_selector, child_metadata, source=source)
            target_metadata = cls._copy_elements_metadata(
                source_metadata, selector_prefix, child_selector,
                target_metadata, source=source)

        return target_metadata
    def _join_by_index(self, main_resource_id: str, inputs: Inputs, inputs_column_index: int, data: typing.Optional[pandas.DataFrame],
                       metadata: metadata_base.DataMetadata, foreign_resource_id: str, foreign_column_index: int) -> typing.Tuple[pandas.DataFrame, metadata_base.DataMetadata]:
        """
        Join the foreign resource onto the main resource by matching values in
        the main resource's column ``inputs_column_index`` against values in
        the foreign resource's column ``foreign_column_index``, appending all
        foreign columns (with their metadata) to ``data``.

        ``data`` may be ``None``, in which case the joined foreign rows become
        the initial DataFrame. Returns the combined DataFrame and updated
        metadata.
        """

        main_column_metadata = inputs.metadata.query((main_resource_id, metadata_base.ALL_ELEMENTS, inputs_column_index))

        main_data = inputs[main_resource_id]
        foreign_data = inputs[foreign_resource_id]

        # Map each foreign-column value to its row index for O(1) lookup.
        # TODO: Check if values are not unique.
        value_to_index = {}
        for value_index, value in enumerate(foreign_data.iloc[:, foreign_column_index]):
            value_to_index[value] = value_index

        # Gather the matching foreign row for every main row, in main-row order.
        rows = []
        for value in main_data.iloc[:, inputs_column_index]:
            rows.append([foreign_data.iloc[value_to_index[value], j] for j in range(len(foreign_data.columns))])

        data_columns_length = 0 if data is None else data.shape[1]

        # Copy over metadata.
        foreign_data_columns_length = inputs.metadata.query((foreign_resource_id, metadata_base.ALL_ELEMENTS))['dimension']['length']
        for column_index in range(foreign_data_columns_length):
            column_metadata = dict(inputs.metadata.query((foreign_resource_id, metadata_base.ALL_ELEMENTS, column_index)))

            # "dict()" above is only a shallow copy, so appending to the original
            # list would mutate the input metadata in place; work on a copy.
            semantic_types = list(column_metadata.get('semantic_types', []))

            # Foreign keys can reference same foreign row multiple times, so values in this column might not be even
            # unique anymore, nor they are a primary key at all. So we remove the semantic type marking a column as such.
            semantic_types = [
                semantic_type for semantic_type in semantic_types
                if semantic_type != 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'
            ]

            # If the original index column was an attribute, make sure the new index column is as well.
            if 'https://metadata.datadrivendiscovery.org/types/Attribute' in main_column_metadata.get('semantic_types', []):
                if 'https://metadata.datadrivendiscovery.org/types/Attribute' not in semantic_types:
                    semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute')

            # If the original index column was a suggested target, make sure the new index column is as well.
            if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in main_column_metadata.get('semantic_types', []):
                if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' not in semantic_types:
                    semantic_types.append('https://metadata.datadrivendiscovery.org/types/SuggestedTarget')

            # Only write the key back if it existed or we now have types to record,
            # so we do not introduce an empty "semantic_types" entry.
            if semantic_types or 'semantic_types' in column_metadata:
                column_metadata['semantic_types'] = semantic_types

            metadata = metadata.update((main_resource_id, metadata_base.ALL_ELEMENTS, data_columns_length + column_index), column_metadata, source=self)

        selected_data = pandas.DataFrame(rows)
        if data is None:
            data = selected_data
        else:
            # Align indices first so concat joins row-wise instead of
            # producing a union of the two (possibly different) indices.
            selected_data = selected_data.set_index(data.index)
            data = pandas.concat([data, selected_data], axis=1, ignore_index=True)
        return data, metadata
 def _add_target_semantic_types(cls, metadata: metadata_base.DataMetadata,
                                source: typing.Any,  target_names: typing.List = None,) -> metadata_base.DataMetadata:
     """
     Mark every column as a target and a predicted target, optionally also
     renaming columns to ``target_names`` (positional, one name per column).
     ``source`` is recorded as the origin of each metadata change.
     """

     column_count = metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
     target_types = (
         'https://metadata.datadrivendiscovery.org/types/Target',
         'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
     )

     for column_index in range(column_count):
         for semantic_type in target_types:
             metadata = metadata.add_semantic_type(
                 (metadata_base.ALL_ELEMENTS, column_index),
                 semantic_type,
                 source=source)

         if target_names:
             metadata = metadata.update((metadata_base.ALL_ELEMENTS, column_index), {
                 'name': target_names[column_index],
             }, source=source)

     return metadata
# Beispiel #5  (scraping artifact: example separator, not valid Python)
# 0
    def _select_columns_metadata(
            cls, inputs_metadata: metadata_base.DataMetadata,
            resource_id: metadata_base.SelectorSegment,
            columns: typing.Sequence[int]) -> metadata_base.DataMetadata:
        """
        This is similar to ``select_columns_metadata`` but operates on a Dataset.

        Returns metadata in which, for the resource identified by
        ``resource_id``, only the given ``columns`` are kept; integer column
        indices are re-enumerated to ``0..len(columns)-1`` in the order given.
        Raises ``exceptions.InvalidArgumentValueError`` if ``columns`` is empty.

        NOTE(review): this reaches into private ``DataMetadata`` internals
        (``_current_metadata``) and mutates element maps in place; see the TODO
        below about the missing metadata-log entry.
        """

        if not columns:
            raise exceptions.InvalidArgumentValueError("No columns selected.")

        # This makes a copy so that we can modify metadata in-place.
        # The update also records the new column count on the resource's
        # column dimension.
        output_metadata = inputs_metadata.update(
            (
                resource_id,
                metadata_base.ALL_ELEMENTS,
            ),
            {
                'dimension': {
                    'length': len(columns),
                },
            },
        )

        # Collect every internal metadata entry whose "elements" map is keyed
        # by column index and therefore needs re-enumerating. Presumably
        # "all_elements.all_elements" is the (ALL_ELEMENTS, ALL_ELEMENTS) entry
        # and "all_elements.elements" the per-column entries under
        # ALL_ELEMENTS — TODO confirm against DataMetadata internals.
        if resource_id is metadata_base.ALL_ELEMENTS:
            metadata_chain = itertools.chain(
                [
                    output_metadata._current_metadata.all_elements.all_elements
                    if output_metadata._current_metadata.all_elements
                    is not None else None
                ],
                output_metadata._current_metadata.all_elements.elements.values(
                ) if output_metadata._current_metadata.all_elements is not None
                else iter([None]),
            )
        else:
            resource_id = typing.cast(metadata_base.SimpleSelectorSegment,
                                      resource_id)

            # For a concrete resource, also include that resource's own
            # ALL_ELEMENTS entry and its per-column entries.
            metadata_chain = itertools.chain(
                [
                    output_metadata._current_metadata.all_elements.all_elements
                    if output_metadata._current_metadata.all_elements
                    is not None else None
                ],
                output_metadata._current_metadata.all_elements.elements.values(
                ) if output_metadata._current_metadata.all_elements is not None
                else iter([None]),
                [
                    output_metadata._current_metadata.elements[resource_id].
                    all_elements
                ],
                output_metadata._current_metadata.elements[resource_id].
                elements.values(),
            )

        # TODO: Do this better. This change is missing an entry in metadata log.
        # Rebuild each entry's elements map, keeping only the selected columns.
        for element_metadata_entry in metadata_chain:
            if element_metadata_entry is None:
                continue

            elements = element_metadata_entry.elements
            element_metadata_entry.elements = {}
            for i, column_index in enumerate(columns):
                if column_index in elements:
                    # If "column_index" is really numeric, we re-enumerate it.
                    if isinstance(column_index, int):
                        element_metadata_entry.elements[i] = elements[
                            column_index]
                    else:
                        # Non-integer selector segments (e.g. ALL_ELEMENTS)
                        # keep their original key.
                        element_metadata_entry.elements[
                            column_index] = elements[column_index]

        return output_metadata