def _copy_elements_metadata(
    cls,
    source_metadata: metadata_base.DataMetadata,
    selector_prefix: metadata_base.Selector,
    selector: metadata_base.Selector,
    target_metadata: metadata_base.DataMetadata,
    *,
    source: typing.Any = None
) -> metadata_base.DataMetadata:
    """
    Recursively copy metadata of all elements below a selector.

    Elements are enumerated in ``source_metadata`` under
    ``selector_prefix + selector``. For each of them, the metadata queried
    at the prefixed selector is written into ``target_metadata`` at the
    selector *without* the prefix, and then the same is done for the
    element's own sub-elements.

    Parameters
    ----------
    source_metadata:
        Metadata object to read from.
    selector_prefix:
        Selector segments prepended only when reading from ``source_metadata``.
    selector:
        Selector (relative to the prefix) whose elements are copied.
    target_metadata:
        Metadata object the copied entries are applied to.
    source:
        Value passed as ``source`` to every ``update`` call; defaults to ``cls``.

    Returns
    -------
    The metadata object resulting from the chain of ``update`` calls
    (``target_metadata`` itself when there are no elements to copy).
    """

    if source is None:
        source = cls

    prefix_segments = list(selector_prefix)
    base_segments = list(selector)

    for element in source_metadata.get_elements(prefix_segments + base_segments):
        child_selector = base_segments + [element]

        # Read with the prefix, write without it.
        target_metadata = target_metadata.update(
            child_selector,
            source_metadata.query(prefix_segments + child_selector),
            source=source,
        )

        # Descend into the element we have just copied.
        target_metadata = cls._copy_elements_metadata(
            source_metadata,
            selector_prefix,
            child_selector,
            target_metadata,
            source=source,
        )

    return target_metadata
def get_target_columns(metadata: DataMetadata):
    """
    Return the names of all columns tagged with the TrueTarget semantic type.

    If the top-level ``structural_type`` equals ``DataFrame``, columns are
    looked up directly under ``(ALL_ELEMENTS, column_index)``. Otherwise the
    metadata is treated as a collection of resources and columns are looked
    up under ``(resource, ALL_ELEMENTS, column_index)``, where ``resource`` is:

    * the only resource, when there is exactly one; or
    * the first resource whose semantic types contain
      ``D3MMetadataTypes.EntryPoint``, when there are several.

    Parameters
    ----------
    metadata:
        Metadata describing either a single DataFrame or a multi-resource
        container.

    Returns
    -------
    A list with the ``name`` of every target column, in column order.

    Raises
    ------
    ValueError
        If there are multiple resources and none of them is marked as an
        entry point (previously this surfaced as an ``UnboundLocalError``).
    """

    is_dataframe = metadata.query(())['structural_type'] == DataFrame

    if is_dataframe:
        column_prefix = (ALL_ELEMENTS, )
    else:
        resources = metadata.get_elements(())
        if len(resources) > 1:
            # Find the learning data resource, i.e., the entry point.
            for resource in resources:
                # A resource without any semantic types is simply not an
                # entry point; use ".get" as for columns below.
                resource_semantic_types = metadata.query(
                    (resource, )).get('semantic_types', [])
                if D3MMetadataTypes.EntryPoint in resource_semantic_types:
                    resource_to_use = resource
                    break
            else:
                raise ValueError(
                    "Metadata has multiple resources but none of them is "
                    "marked as an entry point.")
        else:
            resource_to_use = resources[0]
        column_prefix = (resource_to_use, ALL_ELEMENTS)

    ncolumns = metadata.query(column_prefix)['dimension']['length']

    target_columns = []
    for column_index in range(ncolumns):
        column_metadata = metadata.query(column_prefix + (column_index, ))
        semantic_types = column_metadata.get('semantic_types', [])
        if D3MMetadataTypes.TrueTarget in semantic_types:
            target_columns.append(column_metadata['name'])

    return target_columns
def get_tabular_resource_metadata(
    dataset: metadata_base.DataMetadata,
    resource_id: typing.Optional[metadata_base.SelectorSegment],
    *,
    pick_entry_point: bool = True,
    pick_one: bool = True,
) -> metadata_base.SelectorSegment:
    """
    Resolve which Dataset resource to use, based on metadata only.

    Resolution order when ``resource_id`` is ``None``:

    1. If ``pick_entry_point`` is set, the first resource (iterating the
       top-level elements in reverse) having the ``DatasetEntryPoint``
       semantic type.
    2. If still unresolved and ``pick_one`` is set, the only resource whose
       ``structural_type`` is a subclass of ``container.DataFrame``, provided
       there is exactly one such resource.

    Parameters
    ----------
    dataset:
        Metadata of the Dataset.
    resource_id:
        Explicitly requested resource, or ``None`` to resolve automatically.
    pick_entry_point:
        Whether to try resolving to the entry point resource.
    pick_one:
        Whether to try resolving to the single tabular resource.

    Returns
    -------
    The resolved resource ID.

    Raises
    ------
    ValueError
        If no resource could be resolved.
    TypeError
        If the resolved resource's ``structural_type`` is not a subclass of
        ``container.DataFrame``.
    """

    entry_point_type = 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint'

    if resource_id is None and pick_entry_point:
        # An "ALL_ELEMENTS" match would work too, but a direct resource ID is
        # preferred. "ALL_ELEMENTS" comes first if it exists, hence the reversal.
        resource_id = next(
            (
                candidate_id
                for candidate_id in reversed(dataset.get_elements(()))
                if dataset.has_semantic_type((candidate_id, ), entry_point_type)
            ),
            None,
        )

    if resource_id is None and pick_one:
        # Same preference for direct resource IDs over "ALL_ELEMENTS" as above.
        tabular_resource_ids = []
        for candidate_id in reversed(dataset.get_elements(())):
            candidate_type = dataset.query(
                (candidate_id, )).get('structural_type', None)
            if candidate_type is not None and issubclass(
                    candidate_type, container.DataFrame):
                tabular_resource_ids.append(candidate_id)

        # Only an unambiguous choice is accepted.
        if len(tabular_resource_ids) == 1:
            resource_id = tabular_resource_ids[0]

    if resource_id is not None:
        resource_type = dataset.query((resource_id, ))['structural_type']
        if not issubclass(resource_type, container.DataFrame):
            raise TypeError(
                "The Dataset resource '{resource_id}' is not a DataFrame, but '{type}'."
                .format(
                    resource_id=resource_id,
                    type=resource_type,
                ))
        return resource_id

    # Nothing resolved: the message depends on which strategies were enabled.
    if pick_entry_point and pick_one:
        message = "A Dataset with multiple tabular resources without an entry point and no DataFrame resource specified as a hyper-parameter."
    elif pick_entry_point:
        message = "A Dataset without an entry point and no DataFrame resource specified as a hyper-parameter."
    elif pick_one:
        message = "A Dataset with multiple tabular resources and no DataFrame resource specified as a hyper-parameter."
    else:
        message = "No DataFrame resource specified as a hyper-parameter."

    raise ValueError(message)