def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], outputs_metadata: metadata_base.DataMetadata, hyperparams):
    """Build per-column output metadata by copying from the selected input columns.

    Each copied column keeps its metadata, gains the configured
    ``return_semantic_type``, and falls back to a generated ``output_N``
    name when unnamed. Output columns beyond the copied ones get minimal
    fresh metadata carrying only the return semantic type and a generated
    name.
    """
    outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
    target_columns_metadata: List[OrderedDict] = []

    for column_index in input_indices:
        # Fall back to a generated name when the input column is unnamed.
        name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
        if name is None:
            name = "output_{}".format(column_index)

        column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))
        semantic_types.add(hyperparams["return_semantic_type"])
        column_metadata['semantic_types'] = list(semantic_types)
        column_metadata["name"] = str(name)
        target_columns_metadata.append(column_metadata)

    # If outputs has more columns than were copied, give the remainder
    # minimal metadata with the return semantic type only.
    for column_index in range(len(input_indices), outputs_length):
        extra_metadata = OrderedDict()
        extra_metadata["semantic_types"] = [hyperparams["return_semantic_type"]]
        extra_metadata["name"] = str("output_{}".format(column_index))
        target_columns_metadata.append(extra_metadata)

    return target_columns_metadata
def _get_base_path(self, inputs_metadata: metadata_base.DataMetadata, res_id: str, column_index: int) -> str:
    """Resolve the base URI of the resource that the given column's foreign key references."""
    foreign_key = inputs_metadata.query((res_id, metadata_base.ALL_ELEMENTS, column_index))['foreign_key']
    referenced_metadata = inputs_metadata.query(
        (foreign_key['resource_id'], metadata_base.ALL_ELEMENTS, foreign_key['column_index']))
    return referenced_metadata['location_base_uris'][0]
def _test_metadata(self, metadata: metadata_base.DataMetadata, names: typing.Sequence[str]) -> None:
    """Assert the expected row/column counts and the names and semantic types of columns 3 and 4."""
    self.assertEqual(metadata.query(())['dimension']['length'], 4)
    self.assertEqual(metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'], 5)
    # Columns 3 and 4 must carry the expected names (in order) and the
    # target semantic types.
    for name_index, column_index in enumerate((3, 4)):
        column = metadata.query_column(column_index)
        self.assertEqual(names[name_index], column['name'])
        self.assertEqual(self._target_semantic_types, set(column['semantic_types']))
def _update_metadata(
        self, metadata: metadata_base.DataMetadata,
        resource_id: metadata_base.SelectorSegment
) -> metadata_base.DataMetadata:
    """Promote a Dataset resource's metadata into stand-alone DataFrame container metadata."""
    resource_metadata = dict(metadata.query((resource_id, )))

    if 'structural_type' not in resource_metadata or not issubclass(
            resource_metadata['structural_type'], container.DataFrame):
        raise TypeError(
            "The Dataset resource is not a DataFrame, but \"{type}\".".
            format(type=resource_metadata.get('structural_type', None), ))

    # The resource becomes a top-level container, so attach the container schema.
    resource_metadata.update({'schema': metadata_base.CONTAINER_SCHEMA_VERSION})

    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

    # Resource is not anymore an entry point.
    return new_metadata.remove_semantic_type(
        (), 'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
    """Return True when the column is a string column carrying the Attribute semantic type."""
    column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

    # Only plain string columns are eligible.
    if not issubclass(column_metadata['structural_type'], (str, )):
        return False

    semantic_types = set(column_metadata.get('semantic_types', []))
    if not semantic_types:
        cls.logger.warning("No semantic types found in column metadata")
        return False

    # Every required semantic type must be present on the column.
    required_types = {"https://metadata.datadrivendiscovery.org/types/Attribute"}
    return required_types.issubset(semantic_types)
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
    """
    Output whether a column can be processed.

    Args:
        inputs_metadata: d3m.metadata.base.DataMetadata
        column_index: int

    Returns:
        bool
    """
    column_metadata = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))

    accepted_structural_types = (int, float, numpy.integer, numpy.float64)
    accepted_semantic_types = set()
    accepted_semantic_types.add(
        "https://metadata.datadrivendiscovery.org/types/Attribute")

    if not issubclass(column_metadata['structural_type'],
                      accepted_structural_types):
        return False

    semantic_types = set(column_metadata.get('semantic_types', []))
    # BUG FIX: a stray unconditional `return True` sat here, making the
    # semantic-type filtering below unreachable (every numeric column was
    # accepted regardless of semantic types). Removed so the check runs.
    if len(semantic_types) == 0:
        cls.logger.warning("No semantic types found in column metadata")
        return False

    # Making sure all accepted_semantic_types are available in semantic_types.
    return len(accepted_semantic_types - semantic_types) == 0
def _get_target_columns_metadata(
        cls, outputs_metadata: metadata_base.DataMetadata,
        hyperparams) -> List[OrderedDict]:
    """
    Output metadata of selected columns.

    Args:
        outputs_metadata: metadata_base.DataMetadata
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        d3m.metadata.base.DataMetadata
    """
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict(
            outputs_metadata.query_column(column_index))

        # Update semantic types and prepare it for predicted targets.
        semantic_types = set(column_metadata.get('semantic_types', []))
        semantic_types_to_remove = set([])
        # BUG FIX: this was `add_semantic_types = []` (a list), but `.add()`
        # is a set method, so the next line raised AttributeError at runtime.
        add_semantic_types = set()
        add_semantic_types.add(hyperparams["return_semantic_type"])
        semantic_types = semantic_types - semantic_types_to_remove
        semantic_types = semantic_types.union(add_semantic_types)
        column_metadata['semantic_types'] = list(semantic_types)

        target_columns_metadata.append(column_metadata)
    return target_columns_metadata
def _copy_elements_metadata(
        cls,
        source_metadata: metadata_base.DataMetadata,
        selector_prefix: metadata_base.Selector,
        selector: metadata_base.Selector,
        target_metadata: metadata_base.DataMetadata,
        *,
        source: typing.Any = None) -> metadata_base.DataMetadata:
    """Recursively copy metadata of all elements under ``selector`` into ``target_metadata``."""
    if source is None:
        source = cls

    full_selector = list(selector_prefix) + list(selector)
    for element in source_metadata.get_elements(full_selector):
        element_selector = list(selector) + [element]
        element_metadata = source_metadata.query(list(selector_prefix) + element_selector)
        target_metadata = target_metadata.update(element_selector,
                                                element_metadata,
                                                source=source)
        # Descend into nested elements of this element.
        target_metadata = cls._copy_elements_metadata(source_metadata,
                                                      selector_prefix,
                                                      element_selector,
                                                      target_metadata,
                                                      source=source)
    return target_metadata
def _add_target_columns_metadata(
        cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
    """
    Add target columns metadata.

    Every output column receives fresh metadata carrying only the
    configured ``return_semantic_type``.

    Args:
        outputs_metadata: metadata.base.DataMetadata
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        List[OrderedDict]
    """
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for _ in range(outputs_length):
        column_metadata = OrderedDict()
        column_metadata['semantic_types'] = [hyperparams["return_semantic_type"]]
        target_columns_metadata.append(column_metadata)
    return target_columns_metadata
def _get_target_columns_metadata(
        cls, outputs_metadata: metadata_base.DataMetadata,
        hyperparams) -> List[OrderedDict]:
    """Mark every output column as a PredictedTarget, dropping training-only target types."""
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    # Training-only markers are stripped; prediction markers (plus the
    # configured return semantic type) are added.
    types_to_remove = {
        "https://metadata.datadrivendiscovery.org/types/TrueTarget",
        "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
    }
    types_to_add = {
        "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        hyperparams["return_semantic_type"],
    }

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))
        column_metadata['semantic_types'] = list((semantic_types - types_to_remove) | types_to_add)
        target_columns_metadata.append(column_metadata)
    return target_columns_metadata
def _update_metadata(
    cls,
    metadata: metadata_base.DataMetadata,
    resource_id: metadata_base.SelectorSegment,
) -> metadata_base.DataMetadata:
    """Convert a Dataset resource's metadata into container-level DataFrame metadata."""
    resource_metadata = dict(metadata.query((resource_id, )))

    structural_type = resource_metadata.get("structural_type", None)
    if structural_type is None or not issubclass(structural_type, container.DataFrame):
        raise TypeError(
            'The Dataset resource is not a DataFrame, but "{type}".'.
            format(type=structural_type, ))

    # The resource becomes a top-level container and needs a schema entry.
    resource_metadata.update({"schema": metadata_base.CONTAINER_SCHEMA_VERSION})

    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

    # Resource is not anymore an entry point.
    return new_metadata.remove_semantic_type(
        (), "https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint")
def _reassign_boundaries(self, inputs_metadata: metadata_base.DataMetadata, columns: typing.List[int]) -> metadata_base.DataMetadata:
    """
    Moves metadata about boundaries from the filename column to image object column.
    """
    outputs_metadata = inputs_metadata
    columns_length = inputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

    for column_index in range(columns_length):
        column_metadata = outputs_metadata.query_column(column_index)
        boundary_for = column_metadata.get('boundary_for')
        if boundary_for is None:
            continue
        # TODO: Support also "column_name" boundary metadata.
        if 'column_index' not in boundary_for:
            continue
        if boundary_for['column_index'] not in columns:
            continue
        position = columns.index(boundary_for['column_index'])
        outputs_metadata = outputs_metadata.update_column(column_index, {
            'boundary_for': {
                # We know that "columns" were appended at the end.
                'column_index': columns_length - len(columns) + position,
            }
        })

    return outputs_metadata
def _get_target_columns_metadata(
        self,
        outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]:
    """Ensure every output column is tagged PredictedTarget and loses any TrueTarget tag."""
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    predicted_target = 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
    true_target = 'https://metadata.datadrivendiscovery.org/types/TrueTarget'

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict(outputs_metadata.query_column(column_index))
        semantic_types = list(column_metadata.get('semantic_types', []))
        if predicted_target not in semantic_types:
            semantic_types.append(predicted_target)
        # Drop the training-only TrueTarget marker, keeping order otherwise.
        column_metadata['semantic_types'] = [
            semantic_type for semantic_type in semantic_types
            if semantic_type != true_target
        ]
        target_columns_metadata.append(column_metadata)
    return target_columns_metadata
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
    """Create fresh per-column metadata tagging each output column as a PredictedTarget."""
    outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        # Keep the existing column name when present; otherwise generate one.
        name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
        if name is None:
            name = "output_{}".format(column_index)

        column_metadata = OrderedDict()
        column_metadata["semantic_types"] = ['https://metadata.datadrivendiscovery.org/types/PredictedTarget']
        column_metadata["name"] = str(name)
        target_columns_metadata.append(column_metadata)

    return target_columns_metadata
def _get_ref_resource(self, inputs_metadata: metadata_base.DataMetadata, res_id: str, column_index: int) -> str:
    """Return the resource id referenced by the column's foreign key."""
    column_metadata = inputs_metadata.query((res_id, metadata_base.ALL_ELEMENTS, column_index))
    return column_metadata['foreign_key']['resource_id']
def _can_use_outputs_column(self, outputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool:
    """A column is usable as an output only when it carries the TrueTarget semantic type."""
    semantic_types = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index)).get('semantic_types', [])
    return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in semantic_types
def _parse_metadata(cls, *, metadata: metadata_module.DataMetadata):
    """Build the metadata-lookup structure (primary key, primary resource, targets).

    Returns the lookup dict when it validates via cls._valid_metadata_lookup,
    otherwise None. Raises when zero or multiple primary keys are found.
    """
    flatten = lambda l: [item for sublist in l for item in sublist]
    mdlu = cls._init_metadata_lookup()
    num_res = metadata.query(())['dimension']['length']
    # NOTE(review): assumes resources are numbered 0..num_res-2 with the last
    # one named 'learningData' — confirm this holds for all datasets used here.
    resources = [str(x) for x in range(num_res - 1)]
    resources.append('learningData')
    # A column is treated as the primary key when its name is exactly 'd3mIndex'.
    primary_key = [[
        (res_id, metadata_module.ALL_ELEMENTS, col_id) for col_id in range(
            metadata.query((
                res_id, metadata_module.ALL_ELEMENTS))['dimension']['length'])
        if 'd3mIndex' == metadata.query((res_id, metadata_module.ALL_ELEMENTS,
                                         col_id))['name']
    ] for res_id in resources]
    primary_key = flatten(primary_key)
    if len(primary_key) != 1:
        raise Exception('One primary key supported')
    cls._update_metadata_lookup(mdlu, 'primary_key', primary_key[0])
    # The primary resource is the one holding the primary-key column.
    cls._update_metadata_lookup(mdlu, 'primary_resource_id',
                                (primary_key[0][0], ))
    primary_resource_cols = metadata.query(
        (mdlu['primary_resource_id']['selector'][0],
         metadata_module.ALL_ELEMENTS))
    # Record every TrueTarget column of the primary resource as a target.
    for col_id in range(primary_resource_cols['dimension']['length']):
        cmd = metadata.query((mdlu['primary_resource_id']['selector'][0],
                              metadata_module.ALL_ELEMENTS, col_id))
        col_name = cmd['name']  # NOTE(review): assigned but never used below
        if 'semantic_types' in cmd:
            st = cmd['semantic_types']
            if 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in st:
                cls._update_metadata_lookup(
                    mdlu, 'targets',
                    (mdlu['primary_resource_id']['selector'][0],
                     metadata_module.ALL_ELEMENTS, col_id))
    return mdlu if cls._valid_metadata_lookup(mdlu) else None
def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata, column_indices, hyperparams) -> List[OrderedDict]:
    """Copy metadata of the selected input columns, adding the configured return semantic type.

    Args:
        inputs_metadata: metadata of the input container
        column_indices: indices of the columns whose metadata is copied
        hyperparams: hyperparams providing "return_semantic_type"

    Returns:
        List[OrderedDict] with one metadata dict per selected column.
    """
    target_columns_metadata: List[OrderedDict] = []
    for column_index in column_indices:
        column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
        # BUG FIX: an unnamed column previously became the literal name
        # "None" (str(None)); fall back to a generated name instead,
        # consistent with the sibling copy helpers.
        if column_name is None:
            column_name = "output_{}".format(column_index)

        column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))
        semantic_types.add(hyperparams["return_semantic_type"])
        column_metadata['semantic_types'] = list(semantic_types)
        column_metadata["name"] = str(column_name)
        target_columns_metadata.append(column_metadata)
    # NOTE: the unused outputs_length query present in the original was removed.
    return target_columns_metadata
def _can_use_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool:
    """True when the column is marked with the Integer or Float schema.org type."""
    column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
    semantic_types = column_metadata.get('semantic_types', None)
    if semantic_types is None:
        return False
    numeric_markers = ('http://schema.org/Integer', 'http://schema.org/Float')
    return any(marker in semantic_types for marker in numeric_markers)
def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool:
    # A csv file column is a string column carrying all of the expected
    # semantic types and all of the expected media types.
    column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
    if not column_metadata or column_metadata['structural_type'] != str:
        return False
    has_semantic_types = set(cls._semantic_types).issubset(column_metadata.get('semantic_types', []))
    has_media_types = set(cls._media_types).issubset(column_metadata.get('media_types', []))
    return has_semantic_types and has_media_types
def _can_produce_column(cls, inputs_metadata: mbase.DataMetadata, column_index: int, hyperparams: UEncHyperparameter) -> bool:
    """True only for columns tagged with the Attribute semantic type."""
    semantic_types = inputs_metadata.query((mbase.ALL_ELEMENTS, column_index)).get('semantic_types', [])
    if not semantic_types:
        cls.logger.warning("No semantic types found in column metadata")
        return False
    return "https://metadata.datadrivendiscovery.org/types/Attribute" in semantic_types
def _can_use_column(self, inputs_metadata: metadata_base.DataMetadata, column_index: int) -> bool:
    """A usable column is a string FileName column whose media types are all supported."""
    column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
    if column_metadata['structural_type'] != str:
        return False
    if 'https://metadata.datadrivendiscovery.org/types/FileName' not in column_metadata.get('semantic_types', []):
        return False
    media_types = set(column_metadata.get('media_types', []))
    return media_types <= set(self._supported_media_types)
def _is_csv_file_reference(cls, inputs_metadata: metadata_base.DataMetadata, res_id: int, column_index: int) -> bool:
    # The referenced column must be a string column with at least one of the
    # expected semantic types and every one of the expected media types.
    column_metadata = inputs_metadata.query((res_id, metadata_base.ALL_ELEMENTS, column_index))
    if not column_metadata or column_metadata['structural_type'] != str:
        return False
    semantic_overlap = set(column_metadata.get('semantic_types', [])) & set(cls._semantic_types)
    media_ok = set(cls._media_types).issubset(column_metadata.get('media_types', []))
    return bool(semantic_overlap) and media_ok
def _can_use_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: typing.Optional[int]) -> bool:
    """True when the column has an accepted structural type and at least one accepted semantic type.

    NOTE(review): the original also computed a role check against
    ``cls._roles`` but never used it in the return value; that dead
    computation is removed here. If roles were meant to gate the result,
    re-add it deliberately as ``and valid_role_type``.
    """
    column_metadata = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))

    valid_struct_type = column_metadata.get('structural_type', None) in cls._structural_types
    semantic_types = column_metadata.get('semantic_types', [])
    valid_semantic_type = len(set(cls._semantic_types).intersection(semantic_types)) > 0
    return valid_struct_type and valid_semantic_type
def _add_target_semantic_types(cls, metadata: metadata_base.DataMetadata, source: typing.Any, target_names: typing.List = None,) -> metadata_base.DataMetadata:
    """Tag every column as Target and PredictedTarget, optionally renaming columns from target_names."""
    columns_count = metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
    target_types = (
        'https://metadata.datadrivendiscovery.org/types/Target',
        'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
    )
    for column_index in range(columns_count):
        selector = (metadata_base.ALL_ELEMENTS, column_index)
        for semantic_type in target_types:
            metadata = metadata.add_semantic_type(selector, semantic_type, source=source)
        # Rename the column when explicit target names were provided.
        if target_names:
            metadata = metadata.update(selector, {
                'name': target_names[column_index],
            }, source=source)
    return metadata
def _can_use_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: typing.Optional[int]) -> bool:
    """True when the column is a string column carrying all required semantic types."""
    column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))
    if not column_metadata or column_metadata['structural_type'] != str:
        return False
    return set(cls._semantic_types).issubset(column_metadata.get('semantic_types', []))
def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata, res_id: int, column_index: int) -> bool:
    # check to see if a given column is a file pointer that points to a csv file
    column_metadata = inputs_metadata.query((res_id, metadata_base.ALL_ELEMENTS, column_index))

    if not column_metadata or column_metadata['structural_type'] != str:
        return False

    # BUG FIX: a column without a foreign key has no 'foreign_key' entry at
    # all, so the original `column_metadata['foreign_key'] is None` raised
    # KeyError. Test for presence instead, consistent with the sibling
    # implementation of this helper.
    if 'foreign_key' not in column_metadata:
        return False

    ref_col_index = column_metadata['foreign_key']['column_index']
    ref_res_id = column_metadata['foreign_key']['resource_id']

    return cls._is_csv_file_reference(inputs_metadata, ref_res_id, ref_col_index)
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
    """Return True for numeric columns carrying the Attribute semantic type.

    Args:
        inputs_metadata: d3m.metadata.base.DataMetadata
        column_index: int

    Returns:
        bool
    """
    column_metadata = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))

    # CONSISTENCY FIX: use issubclass (as the sibling _can_produce_column
    # implementations do) instead of exact-type membership, so numeric
    # subclasses of the accepted types are not silently rejected.
    accepted_structural_types = (int, float, np.int64, np.float64)
    if not issubclass(column_metadata['structural_type'], accepted_structural_types):
        return False

    semantic_types = column_metadata.get('semantic_types', [])
    if len(semantic_types) == 0:
        cls.logger.warning("No semantic types found in column metadata")
        return False

    return "https://metadata.datadrivendiscovery.org/types/Attribute" in semantic_types
def _is_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata, res_id: str, column_index: int) -> bool:
    """A csv file column is a string column whose foreign key references a csv resource."""
    column_metadata = inputs_metadata.query(
        (res_id, metadata_base.ALL_ELEMENTS, column_index))

    if not column_metadata or column_metadata["structural_type"] != str:
        return False

    # Columns without a foreign key cannot point at another resource.
    if "foreign_key" not in column_metadata:
        return False

    foreign_key = column_metadata["foreign_key"]
    return cls._is_csv_file_reference(inputs_metadata,
                                      foreign_key["resource_id"],
                                      foreign_key["column_index"])
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
    """
    Output whether a column can be processed.

    Args:
        inputs_metadata: d3m.metadata.base.DataMetadata
        column_index: int

    Returns:
        bool
    """
    column_metadata = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))

    accepted_structural_types = (int, float, np.integer, np.float64, str)
    accepted_semantic_types = set()
    accepted_semantic_types.add(
        "https://metadata.datadrivendiscovery.org/types/Attribute")

    if not issubclass(column_metadata['structural_type'],
                      accepted_structural_types):
        # DEBUG CLEANUP: replaced stdout print with the class logger.
        cls.logger.debug(
            "Column %s does not match the structural_type requirements in metadata. Skipping column",
            column_index,
        )
        return False

    semantic_types = set(column_metadata.get('semantic_types', []))
    # DEBUG CLEANUP: removed the unconditional `return True` ("for testing
    # purposes for custom dataframes") that bypassed the semantic-type check
    # below, and the leftover print of semantic_types.
    if len(semantic_types) == 0:
        cls.logger.warning("No semantic types found in column metadata")
        return False

    # Making sure all accepted_semantic_types are available in semantic_types.
    return len(accepted_semantic_types - semantic_types) == 0