Example #1
0
    def _copy_elements_metadata(
            cls,
            source_metadata: metadata_base.DataMetadata,
            selector_prefix: metadata_base.Selector,
            selector: metadata_base.Selector,
            target_metadata: metadata_base.DataMetadata,
            *,
            source: typing.Any = None) -> metadata_base.DataMetadata:
        """
        Recursively copy element metadata from ``source_metadata`` into
        ``target_metadata``.

        Elements are looked up in ``source_metadata`` under
        ``selector_prefix + selector``, but are written into
        ``target_metadata`` under ``selector + [element]`` only, i.e. with
        the prefix dropped.

        Parameters
        ----------
        source_metadata:
            Metadata object to read from.
        selector_prefix:
            Selector segments prepended to every lookup in ``source_metadata``.
        selector:
            Selector (relative to the prefix) whose elements are copied.
        target_metadata:
            Metadata object that receives the copied entries.
        source:
            Value forwarded as ``source=`` to ``target_metadata.update``;
            defaults to ``cls`` when ``None``.

        Returns
        -------
        The metadata object produced by the chain of ``update`` calls (or
        ``target_metadata`` itself when there are no elements to copy).
        """

        source = cls if source is None else source

        # Both selectors are loop invariants; materialize them as lists once.
        prefix_segments = list(selector_prefix)
        relative_segments = list(selector)

        for element in source_metadata.get_elements(prefix_segments + relative_segments):
            child_selector = relative_segments + [element]
            child_metadata = source_metadata.query(prefix_segments + child_selector)

            # "update" returns the new metadata object, so keep rebinding it.
            target_metadata = target_metadata.update(
                child_selector, child_metadata, source=source)

            # Descend into the element we just copied.
            target_metadata = cls._copy_elements_metadata(
                source_metadata,
                selector_prefix,
                child_selector,
                target_metadata,
                source=source)

        return target_metadata
Example #2
0
def get_target_columns(metadata: DataMetadata):
    """
    Return the names of all columns marked as true targets.

    If the top-level ``structural_type`` equals ``DataFrame``, columns are
    read directly under ``(ALL_ELEMENTS, column_index)``. Otherwise the
    metadata is treated as a container of resources: a single resource is
    used as-is, and among several resources the first one carrying the
    ``D3MMetadataTypes.EntryPoint`` semantic type is used.

    Parameters
    ----------
    metadata:
        Metadata object to inspect.

    Returns
    -------
    List of ``name`` values of the columns whose ``semantic_types`` contain
    ``D3MMetadataTypes.TrueTarget``, in column order.

    Raises
    ------
    ValueError
        If the metadata is not for a DataFrame and either has no resources,
        or has several resources none of which is an entry point.
    """

    is_dataframe = metadata.query(())['structural_type'] == DataFrame

    if is_dataframe:
        column_prefix = (ALL_ELEMENTS, )
    else:
        resources = metadata.get_elements(())
        if not resources:
            raise ValueError(
                "Cannot determine target columns: metadata has no resources.")

        if len(resources) == 1:
            resource_to_use = resources[0]
        else:
            # Find the learning data resource (the entry point).
            for resource in resources:
                resource_semantic_types = metadata.query(
                    (resource, )).get('semantic_types', ())
                if D3MMetadataTypes.EntryPoint in resource_semantic_types:
                    resource_to_use = resource
                    break
            else:
                # Previously this fell through with "resource_to_use" unbound.
                raise ValueError(
                    "Cannot determine target columns: metadata has multiple "
                    "resources but none is marked as an entry point.")

        column_prefix = (resource_to_use, ALL_ELEMENTS)

    ncolumns = metadata.query(column_prefix)['dimension']['length']

    target_columns = []
    for column_index in range(ncolumns):
        column_metadata = metadata.query(column_prefix + (column_index, ))
        semantic_types = column_metadata.get('semantic_types', [])
        if D3MMetadataTypes.TrueTarget in semantic_types:
            target_columns.append(column_metadata['name'])

    return target_columns
Example #3
0
def get_tabular_resource_metadata(
    dataset: metadata_base.DataMetadata,
    resource_id: typing.Optional[metadata_base.SelectorSegment],
    *,
    pick_entry_point: bool = True,
    pick_one: bool = True,
) -> metadata_base.SelectorSegment:
    """
    Resolve which resource of ``dataset`` metadata should be used as the
    tabular (DataFrame) resource and return its ID.

    Resolution order when ``resource_id`` is ``None``:

    1. If ``pick_entry_point`` is set, the first resource (scanning the
       elements in reverse order) that has the ``DatasetEntryPoint``
       semantic type.
    2. If still unresolved and ``pick_one`` is set, the only resource whose
       ``structural_type`` is a subclass of ``container.DataFrame``,
       provided there is exactly one such resource.

    Parameters
    ----------
    dataset:
        Metadata object to inspect.
    resource_id:
        Explicit resource ID, or ``None`` to resolve one automatically.
    pick_entry_point:
        Whether to try the entry point resource.
    pick_one:
        Whether to fall back to the single DataFrame resource.

    Returns
    -------
    The resolved resource ID.

    Raises
    ------
    ValueError
        If no resource ID was given and none could be resolved.
    TypeError
        If the resolved resource's ``structural_type`` is not a subclass of
        ``container.DataFrame``.
    """

    if resource_id is None and pick_entry_point:
        # The element list may contain "ALL_ELEMENTS", which would also work, but a
        # direct resource ID is preferred. "ALL_ELEMENTS" comes first when present,
        # so the list is scanned in reverse.
        resource_id = next(
            (
                candidate_id
                for candidate_id in reversed(dataset.get_elements(()))
                if dataset.has_semantic_type(
                    (candidate_id, ),
                    'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint',
                )
            ),
            None,
        )

    if resource_id is None and pick_one:
        # Same reverse scan as above, for the same reason.
        dataframe_ids = []
        for candidate_id in reversed(dataset.get_elements(())):
            candidate_type = dataset.query((candidate_id, )).get('structural_type', None)
            if candidate_type is not None and issubclass(candidate_type, container.DataFrame):
                dataframe_ids.append(candidate_id)

        # Only an unambiguous, single DataFrame resource is accepted.
        if len(dataframe_ids) == 1:
            resource_id = dataframe_ids[0]

    if resource_id is None:
        # The message depends on which resolution strategies were allowed.
        if pick_entry_point:
            if pick_one:
                message = "A Dataset with multiple tabular resources without an entry point and no DataFrame resource specified as a hyper-parameter."
            else:
                message = "A Dataset without an entry point and no DataFrame resource specified as a hyper-parameter."
        elif pick_one:
            message = "A Dataset with multiple tabular resources and no DataFrame resource specified as a hyper-parameter."
        else:
            message = "No DataFrame resource specified as a hyper-parameter."
        raise ValueError(message)

    resource_type = dataset.query((resource_id, ))['structural_type']

    if issubclass(resource_type, container.DataFrame):
        return resource_id

    raise TypeError(
        "The Dataset resource '{resource_id}' is not a DataFrame, but '{type}'."
        .format(
            resource_id=resource_id,
            type=resource_type,
        ))