def _get_target_columns_metadata(
        cls, outputs_metadata: metadata_base.DataMetadata,
        hyperparams) -> List[OrderedDict]:
    """
    Output metadata of selected columns.

    Args:
        outputs_metadata: metadata_base.DataMetadata
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        List[OrderedDict]
    """
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict(
            outputs_metadata.query_column(column_index))

        # Update semantic types and prepare it for predicted targets.
        semantic_types = set(column_metadata.get('semantic_types', []))
        semantic_types_to_remove = set()
        add_semantic_types = set()
        add_semantic_types.add(hyperparams["return_semantic_type"])
        semantic_types = semantic_types - semantic_types_to_remove
        semantic_types = semantic_types.union(add_semantic_types)
        column_metadata['semantic_types'] = list(semantic_types)

        target_columns_metadata.append(column_metadata)

    return target_columns_metadata
def _update_metadata(
        self, metadata: metadata_base.DataMetadata,
        resource_id: metadata_base.SelectorSegment
) -> metadata_base.DataMetadata:
    resource_metadata = dict(metadata.query((resource_id, )))

    if 'structural_type' not in resource_metadata or not issubclass(
            resource_metadata['structural_type'], container.DataFrame):
        raise TypeError(
            "The Dataset resource is not a DataFrame, but \"{type}\".".format(
                type=resource_metadata.get('structural_type', None), ))

    resource_metadata.update(
        {
            'schema': metadata_base.CONTAINER_SCHEMA_VERSION,
        }, )

    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

    # The resource is no longer a dataset entry point.
    new_metadata = new_metadata.remove_semantic_type(
        (),
        'https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint')

    return new_metadata
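# Hedged usage sketch (not from the original code): _update_metadata above is
# the kind of helper a dataset-to-dataframe step pairs with resource
# extraction. The names `dataset` and `resource_id` are assumptions for
# illustration.
def _extract_dataframe(self, dataset, resource_id):
    # Pull the resource out of the Dataset container and attach the
    # cleaned-up, container-level metadata produced by _update_metadata.
    dataframe = dataset[resource_id]
    dataframe.metadata = self._update_metadata(dataset.metadata, resource_id)
    return dataframe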
def _get_target_columns_metadata(
        self,
        outputs_metadata: metadata_base.DataMetadata) -> List[OrderedDict]:
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict(
            outputs_metadata.query_column(column_index))

        # Update semantic types and prepare it for predicted targets.
        semantic_types = list(column_metadata.get('semantic_types', []))
        if 'https://metadata.datadrivendiscovery.org/types/PredictedTarget' not in semantic_types:
            semantic_types.append(
                'https://metadata.datadrivendiscovery.org/types/PredictedTarget'
            )
        semantic_types = [
            semantic_type for semantic_type in semantic_types
            if semantic_type !=
            'https://metadata.datadrivendiscovery.org/types/TrueTarget'
        ]
        column_metadata['semantic_types'] = semantic_types

        target_columns_metadata.append(column_metadata)

    return target_columns_metadata
def _copy_elements_metadata(
        cls,
        source_metadata: metadata_base.DataMetadata,
        selector_prefix: metadata_base.Selector,
        selector: metadata_base.Selector,
        target_metadata: metadata_base.DataMetadata,
        *,
        source: typing.Any = None) -> metadata_base.DataMetadata:
    if source is None:
        source = cls

    elements = source_metadata.get_elements(
        list(selector_prefix) + list(selector))

    for element in elements:
        new_selector = list(selector) + [element]
        metadata = source_metadata.query(
            list(selector_prefix) + new_selector)
        target_metadata = target_metadata.update(new_selector,
                                                 metadata,
                                                 source=source)
        # Recurse into the elements nested under the current selector.
        target_metadata = cls._copy_elements_metadata(source_metadata,
                                                      selector_prefix,
                                                      new_selector,
                                                      target_metadata,
                                                      source=source)

    return target_metadata
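# Hedged usage sketch (assumed wrapper, not from the original code): seed a
# fresh DataMetadata with the metadata stored at `selector_prefix`, then let
# _copy_elements_metadata recursively copy everything below it.
def _copy_metadata_tree(cls, source_metadata, selector_prefix=()):
    target_metadata = metadata_base.DataMetadata(
        dict(source_metadata.query(selector_prefix)))
    # Start the recursion with an empty selector relative to the prefix.
    return cls._copy_elements_metadata(source_metadata, selector_prefix, [],
                                       target_metadata)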
def _update_metadata(
    cls,
    metadata: metadata_base.DataMetadata,
    resource_id: metadata_base.SelectorSegment,
) -> metadata_base.DataMetadata:
    resource_metadata = dict(metadata.query((resource_id, )))

    if "structural_type" not in resource_metadata or not issubclass(
            resource_metadata["structural_type"], container.DataFrame):
        raise TypeError(
            'The Dataset resource is not a DataFrame, but "{type}".'.format(
                type=resource_metadata.get("structural_type", None), ))

    resource_metadata.update(
        {
            "schema": metadata_base.CONTAINER_SCHEMA_VERSION,
        }, )

    new_metadata = metadata_base.DataMetadata(resource_metadata)
    new_metadata = metadata.copy_to(new_metadata, (resource_id, ))

    # Resource is not anymore an entry point.
    new_metadata = new_metadata.remove_semantic_type(
        (),
        "https://metadata.datadrivendiscovery.org/types/DatasetEntryPoint")

    return new_metadata
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    self._fitted = True
    categorical_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData"
        ]
    )
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"]
    )
    self._s_cols = container.List(set(all_attributes).intersection(categorical_attributes))
    _logger.debug("%d categorical attributes found.", len(self._s_cols))

    if len(self._s_cols) > 0:
        # Record the distinct non-null values of every categorical column.
        self._model = {}
        for col_index in self._s_cols:
            self._model[col_index] = self._training_data.iloc[:, col_index].dropna().unique()

    return CallResult(None, has_finished=True)
def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata,
                          input_indices: List[int],
                          outputs_metadata: metadata_base.DataMetadata,
                          hyperparams):
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS,))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in input_indices:
        column_name = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index)).get("name")
        if column_name is None:
            column_name = "output_{}".format(column_index)

        column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))
        semantic_types_to_remove = set()
        add_semantic_types = set()
        add_semantic_types.add(hyperparams["return_semantic_type"])
        semantic_types = semantic_types - semantic_types_to_remove
        semantic_types = semantic_types.union(add_semantic_types)
        column_metadata['semantic_types'] = list(semantic_types)
        column_metadata["name"] = str(column_name)
        target_columns_metadata.append(column_metadata)

    # If outputs has more columns than input_indices, give every remaining
    # column the configured return semantic type and a generated name.
    if outputs_length > len(input_indices):
        for column_index in range(len(input_indices), outputs_length):
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_name = "output_{}".format(column_index)
            column_metadata["semantic_types"] = list(semantic_types)
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

    return target_columns_metadata
def _get_target_columns_metadata(
        cls, outputs_metadata: metadata_base.DataMetadata,
        hyperparams) -> List[OrderedDict]:
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict(
            outputs_metadata.query_column(column_index))

        # Update semantic types and prepare it for predicted targets.
        semantic_types = set(column_metadata.get('semantic_types', []))
        semantic_types_to_remove = set([
            "https://metadata.datadrivendiscovery.org/types/TrueTarget",
            "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
        ])
        add_semantic_types = set([
            "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
        ])
        add_semantic_types.add(hyperparams["return_semantic_type"])
        semantic_types = semantic_types - semantic_types_to_remove
        semantic_types = semantic_types.union(add_semantic_types)
        column_metadata['semantic_types'] = list(semantic_types)

        target_columns_metadata.append(column_metadata)

    return target_columns_metadata
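# Hedged usage sketch: the List[OrderedDict] returned by
# _get_target_columns_metadata is typically written back column by column onto
# the outputs container. `outputs` and `hyperparams` are assumed names, not
# from the original code.
def _update_predictions_metadata(cls, outputs, hyperparams):
    target_columns_metadata = cls._get_target_columns_metadata(
        outputs.metadata, hyperparams)
    for column_index, column_metadata in enumerate(target_columns_metadata):
        # DataMetadata is immutable, so update_column returns a new object.
        outputs.metadata = outputs.metadata.update_column(
            column_index, dict(column_metadata))
    return outputs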
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    if self._fitted:
        return CallResult(None)
    if self._input_data is None:
        raise ValueError('Missing training (fitting) data.')

    # Look at attribute columns only.
    data = self._input_data.copy()
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    # Remove columns with all empty values, structural type str.
    numeric = DataMetadata.list_columns_with_semantic_types(
        data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
    numeric = [x for x in numeric if x in all_attributes]

    self._empty_columns = []
    _logger.debug(f'Numeric columns: {numeric}')
    for element in numeric:
        if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
            if pd.isnull(pd.to_numeric(data.iloc[:, element])).sum() == data.shape[0]:
                _logger.debug(f'Empty numeric str column: {element}')
                self._empty_columns.append(element)

    # Remove columns with all empty values, structural type numeric.
    is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
    for i in all_attributes:
        if is_empty.iloc[i] and i not in self._empty_columns:
            _logger.debug(f'Empty numeric column: {i}')
            self._empty_columns.append(i)

    _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))
    data = container.DataFrame.remove_columns(data, self._empty_columns)

    categorical_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
            "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
    self._cat_columns = data.columns[self._cat_col_index].tolist()
    _logger.debug('Encoding columns: {}'.format(self._cat_columns))

    mapping = {}
    for column_name in self._cat_columns:
        col = data[column_name]
        temp = self._trim_features(col, self.hyperparams['n_limit'])
        if temp:
            mapping[temp[0]] = temp[1]
    self._mapping = mapping
    self._fitted = True
    return CallResult(None, has_finished=True)
def __get_fitted(self):
    attribute = DataMetadata.list_columns_with_semantic_types(
        self._train_x.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])

    # Mean for numerical columns.
    self._numeric_columns = DataMetadata.list_columns_with_semantic_types(
        self._train_x.metadata,
        ['http://schema.org/Integer', 'http://schema.org/Float'])
    self._numeric_columns = [
        x for x in self._numeric_columns if x in attribute
    ]
    _logger.debug('numeric columns %s', str(self._numeric_columns))

    # Convert selected columns to_numeric, then compute column mean, then convert to_dict.
    self.mean_values = self._train_x.iloc[:, self._numeric_columns].apply(
        lambda col: pd.to_numeric(col, errors='coerce')).mean(
            axis=0).to_dict()
    for name in self.mean_values.keys():
        if pd.isnull(self.mean_values[name]):
            self.mean_values[name] = 0.0

    # Mode for categorical columns.
    self._categoric_columns = DataMetadata.list_columns_with_semantic_types(
        self._train_x.metadata, [
            'https://metadata.datadrivendiscovery.org/types/CategoricalData',
            'http://schema.org/Boolean'
        ])
    self._categoric_columns = [
        x for x in self._categoric_columns if x in attribute
    ]
    _logger.debug('categorical columns %s', str(self._categoric_columns))

    mode_values = self._train_x.iloc[:, self._categoric_columns].mode(
        axis=0).iloc[0].to_dict()
    for name in mode_values.keys():
        if pd.isnull(mode_values[name]):  # mode is nan
            rest = self._train_x[name].dropna()
            if rest.shape[0] == 0:  # every value is nan
                mode = 0
            else:
                mode = rest.mode().iloc[0]
            mode_values[name] = mode

    self.mean_values.update(mode_values)

    if self._verbose:
        import pprint
        print('mean imputation:')
        pprint.pprint(self.mean_values)

    _logger.debug('Mean values:')
    for name, value in self.mean_values.items():
        _logger.debug('  %s %s', name, str(value))
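# Hedged sketch of the produce-time counterpart (assumed method name and
# `inputs` argument, not from the original code): because __get_fitted stores
# imputation values keyed by column name, pandas' fillna can consume the dict
# directly.
def _impute(self, inputs):
    # fillna with a {column name: value} dict fills each column independently.
    return inputs.fillna(value=self.mean_values)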
def _get_base_path(self, inputs_metadata: metadata_base.DataMetadata,
                   res_id: str, column_index: int) -> str:
    # get the base uri from the referenced column
    column_metadata = inputs_metadata.query(
        (res_id, metadata_base.ALL_ELEMENTS, column_index))

    ref_col_index = column_metadata['foreign_key']['column_index']
    ref_res_id = column_metadata['foreign_key']['resource_id']

    return inputs_metadata.query((ref_res_id, metadata_base.ALL_ELEMENTS,
                                  ref_col_index))['location_base_uris'][0]
def _add_target_semantic_types(cls, metadata: metadata_base.DataMetadata,
                               source: typing.Any,
                               target_names: typing.List = None,
                               ) -> metadata_base.DataMetadata:
    for column_index in range(metadata.query(
            (metadata_base.ALL_ELEMENTS,))['dimension']['length']):
        metadata = metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, column_index),
            'https://metadata.datadrivendiscovery.org/types/Target',
            source=source)
        metadata = metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, column_index),
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget',
            source=source)
        if target_names:
            metadata = metadata.update(
                (metadata_base.ALL_ELEMENTS, column_index), {
                    'name': target_names[column_index],
                },
                source=source)
    return metadata
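# Hedged usage sketch (assumed names `predictions` and `target_names`, not
# from the original code): _add_target_semantic_types is typically applied to
# the metadata of a freshly constructed predictions container before it is
# returned.
def _mark_predictions(cls, predictions, target_names):
    predictions.metadata = cls._add_target_semantic_types(
        metadata=predictions.metadata, source=cls, target_names=target_names)
    return predictions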
def _produce_column_metadata(
    self,
    inputs_metadata: metadata_base.DataMetadata,
    column_index: int,
    read_files: typing.Sequence[typing.Any],
) -> metadata_base.DataMetadata:
    column_metadata = inputs_metadata.select_columns([column_index])
    column_metadata = column_metadata.update_column(0, {
        'structural_type': self._file_structural_type,
        # Clear metadata useful for filename columns.
        'location_base_uris': metadata_base.NO_VALUE,
        'media_types': metadata_base.NO_VALUE,
    })

    # It is not a filename anymore.
    column_metadata = column_metadata.remove_semantic_type(
        (metadata_base.ALL_ELEMENTS, 0),
        'https://metadata.datadrivendiscovery.org/types/FileName')

    # At least one semantic type from listed semantic types should be set.
    semantic_types = column_metadata.query_column(0).get('semantic_types', [])
    if not set(semantic_types) & set(self._file_semantic_types):
        # Add the first one.
        column_metadata = column_metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0), self._file_semantic_types[0])

    for row_index, file in enumerate(read_files):
        # Copy metadata only if we have a container type.
        if isinstance(file, types.Container):
            column_metadata = file.metadata.copy_to(column_metadata, (), (row_index, 0))

    column_metadata = column_metadata.compact([
        'name', 'structural_type', 'media_types', 'location_base_uris',
        'semantic_types'
    ])

    return column_metadata
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
    numerical_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=["http://schema.org/Float", "http://schema.org/Integer"])
    all_attributes = DataMetadata.list_columns_with_semantic_types(
        self=self._training_data.metadata,
        semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])

    self._s_cols = list(set(all_attributes).intersection(numerical_attributes))
    if len(self._s_cols) > 0:
        self._model.fit(self._training_data.iloc[:, self._s_cols])
        self._fitted = True
    else:
        self._fitted = False
    return CallResult(None, has_finished=True, iterations_done=1)
def _reassign_boundaries(self, inputs_metadata: metadata_base.DataMetadata,
                         columns: typing.List[int]) -> metadata_base.DataMetadata:
    """
    Moves metadata about boundaries from the filename column to the image object column.
    """
    outputs_metadata = inputs_metadata
    columns_length = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS,))['dimension']['length']

    for column_index in range(columns_length):
        column_metadata = outputs_metadata.query_column(column_index)

        if 'boundary_for' not in column_metadata:
            continue

        # TODO: Support also "column_name" boundary metadata.
        if 'column_index' not in column_metadata['boundary_for']:
            continue

        try:
            i = columns.index(column_metadata['boundary_for']['column_index'])
        except ValueError:
            continue

        outputs_metadata = outputs_metadata.update_column(column_index, {
            'boundary_for': {
                # We know that "columns" were appended at the end.
                'column_index': columns_length - len(columns) + i,
            }
        })

    return outputs_metadata
def set_training_data(self, *, inputs: Input) -> None:
    """
    Sets training data of this primitive.

    Parameters
    ----------
    inputs : Input
        The inputs.
    """
    attribute = DataMetadata.list_columns_with_semantic_types(
        inputs.metadata,
        ['https://metadata.datadrivendiscovery.org/types/Attribute'])

    nan_sum = 0
    for col in attribute:
        if str(inputs.dtypes[inputs.columns[col]]) != "object":
            nan_sum += inputs.iloc[:, col].isnull().sum()
        else:
            for i in range(inputs.shape[0]):
                if inputs.iloc[i, col] == "" or pd.isnull(inputs.iloc[i, col]):
                    nan_sum += 1

    if nan_sum == 0:  # No missing values exist.
        if self._verbose:
            _logger.info('no missing values in the training dataset')

    self._train_x = inputs
    self._is_fitted = False
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                        column_index: int, hyperparams: Hyperparams) -> bool:
    column_metadata = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))

    accepted_structural_types = (str, )
    accepted_semantic_types = set()
    accepted_semantic_types.add(
        "https://metadata.datadrivendiscovery.org/types/Attribute")

    if not issubclass(column_metadata['structural_type'],
                      accepted_structural_types):
        return False

    semantic_types = set(column_metadata.get('semantic_types', []))
    if len(semantic_types) == 0:
        cls.logger.warning("No semantic types found in column metadata")
        return False

    # Making sure all accepted_semantic_types are available in semantic_types.
    if len(accepted_semantic_types - semantic_types) == 0:
        return True

    return False
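# Hedged sketch of how _can_produce_column is commonly driven (assumed helper,
# not from the original code): iterate over all columns and keep the indices
# the predicate accepts.
def _get_columns_to_produce(cls, inputs_metadata, hyperparams):
    columns_length = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']
    return [
        column_index for column_index in range(columns_length)
        if cls._can_produce_column(inputs_metadata, column_index, hyperparams)
    ]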
def _add_column(
    self, main_resource_id: str, data: pandas.DataFrame,
    metadata: metadata_base.DataMetadata, column_data: pandas.DataFrame,
    column_metadata: typing.Dict
) -> typing.Tuple[pandas.DataFrame, metadata_base.DataMetadata]:
    assert column_data.shape[1] == 1

    if data is None:
        data = column_data
    else:
        # Align the new column with the existing rows before concatenating.
        column_data = column_data.set_index(data.index)
        data = pandas.concat([data, column_data], axis=1)

    metadata = metadata.update(
        (main_resource_id, metadata_base.ALL_ELEMENTS, data.shape[1] - 1),
        column_metadata,
        source=self)

    return data, metadata
def _add_target_columns_metadata(
        cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
    """
    Add target columns metadata.

    Args:
        outputs_metadata: metadata_base.DataMetadata
        hyperparams: d3m.metadata.hyperparams.Hyperparams

    Returns:
        List[OrderedDict]
    """
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, ))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict()
        semantic_types = set()
        semantic_types.add(hyperparams["return_semantic_type"])
        column_metadata['semantic_types'] = list(semantic_types)
        target_columns_metadata.append(column_metadata)

    return target_columns_metadata
def _can_use_column(
    self, inputs_metadata: metadata_base.DataMetadata, column_index: int
) -> bool:
    """
    Originally from d3m.primitives.schema_discovery.profiler.Common.
    """
    column_metadata = inputs_metadata.query_column(column_index)
    semantic_types = column_metadata.get("semantic_types", [])

    # We detect only on columns which have no semantic types or where it is
    # explicitly set as unknown.
    if (
        not semantic_types
        or "https://metadata.datadrivendiscovery.org/types/UnknownType"
        in semantic_types
    ):
        return True

    # A special case to handle setting
    # "https://metadata.datadrivendiscovery.org/types/TrueTarget".
    if (
        "https://metadata.datadrivendiscovery.org/types/SuggestedTarget"
        in semantic_types
    ):
        return True

    return False
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
    outputs_length = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS,))['dimension']['length']

    target_columns_metadata: List[OrderedDict] = []
    for column_index in range(outputs_length):
        column_metadata = OrderedDict()
        semantic_types = []
        semantic_types.append(
            'https://metadata.datadrivendiscovery.org/types/PredictedTarget')
        column_name = outputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index)).get("name")
        if column_name is None:
            column_name = "output_{}".format(column_index)
        column_metadata["semantic_types"] = semantic_types
        column_metadata["name"] = str(column_name)
        target_columns_metadata.append(column_metadata)

    return target_columns_metadata
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata,
                        column_index: int, hyperparams: Hyperparams) -> bool:
    """
    Returns whether a column can be processed.

    Args:
        inputs_metadata: d3m.metadata.base.DataMetadata
        column_index: int

    Returns:
        bool
    """
    column_metadata = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))

    accepted_structural_types = (int, float, numpy.integer, numpy.float64)
    accepted_semantic_types = set()
    accepted_semantic_types.add(
        "https://metadata.datadrivendiscovery.org/types/Attribute")

    if not issubclass(column_metadata['structural_type'],
                      accepted_structural_types):
        return False

    semantic_types = set(column_metadata.get('semantic_types', []))
    if len(semantic_types) == 0:
        cls.logger.warning("No semantic types found in column metadata")
        return False

    # Making sure all accepted_semantic_types are available in semantic_types.
    if len(accepted_semantic_types - semantic_types) == 0:
        return True

    return False
def _split_column(self, inputs):
    """
    Inner function to sample part of the columns of the input dataset.
    """
    input_dataset_shape = inputs[self._main_resource_id].shape
    # Find the target columns; we should not split these columns.
    target_column = DataMetadata.list_columns_with_semantic_types(
        self._training_inputs.metadata,
        ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
        at=(self._main_resource_id, ))
    if not target_column:
        self._logger.warning("No target column found from the input dataset.")

    index_column = DataMetadata.get_index_columns(
        self._training_inputs.metadata, at=(self._main_resource_id, ))
    if not index_column:
        self._logger.warning("No index column found from the input dataset.")

    outputs = copy.copy(inputs)
    if self._status is Status.TRAIN:
        # Check the number of attribute columns only; we only need to sample
        # when it is larger than the threshold.
        attribute_column_length = (input_dataset_shape[1] -
                                   len(index_column) - len(target_column))
        if attribute_column_length > self._threshold_column_length:
            attribute_column = set(range(input_dataset_shape[1]))
            for each_target_column in target_column:
                attribute_column.remove(each_target_column)
            for each_index_column in index_column:
                attribute_column.remove(each_index_column)

            # Randomly sample the attribute column indices to keep and sort them.
            self._column_remained = random.sample(
                list(attribute_column), self._threshold_column_length)
            self._column_remained.extend(target_column)
            self._column_remained.extend(index_column)
            self._column_remained.sort()

    if len(self._column_remained) > 0:
        # Just to make sure.
        outputs.metadata = copy.deepcopy(inputs.metadata)
        outputs[self._main_resource_id] = inputs[
            self._main_resource_id].iloc[:, self._column_remained]
        outputs.metadata = self._select_columns_metadata(
            outputs.metadata, self._main_resource_id, self._column_remained)

    return outputs
def _find_csv_file_column(cls, inputs_metadata: metadata_base.DataMetadata,
                          res_id: str) -> typing.Optional[int]:
    indices = inputs_metadata.list_columns_with_semantic_types(
        cls._semantic_types, at=(res_id, ))
    for i in indices:
        if cls._is_csv_file_column(inputs_metadata, res_id, i):
            return i
    return None
def _get_date_cols(data):
    dates = DataMetadata.list_columns_with_semantic_types(
        data.metadata,
        semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Time"
        ])
    return dates
def _can_use_outputs_column(self, outputs_metadata: metadata_base.DataMetadata,
                            column_index: int) -> bool:
    column_metadata = outputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))
    return 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in column_metadata.get(
        'semantic_types', [])
def _get_ref_resource(self, inputs_metadata: metadata_base.DataMetadata,
                      res_id: str, column_index: int) -> str:
    # get the referenced resource from the referenced column
    column_metadata = inputs_metadata.query(
        (res_id, metadata_base.ALL_ELEMENTS, column_index))
    ref_res_id = column_metadata['foreign_key']['resource_id']
    return ref_res_id
def _parse_metadata(cls, *, metadata: metadata_module.DataMetadata):
    flatten = lambda l: [item for sublist in l for item in sublist]
    mdlu = cls._init_metadata_lookup()

    num_res = metadata.query(())['dimension']['length']
    # All resources are numbered except the last one, named 'learningData'.
    resources = [str(x) for x in range(num_res - 1)]
    resources.append('learningData')

    # Find the primary key column ('d3mIndex') across all resources.
    primary_key = [[
        (res_id, metadata_module.ALL_ELEMENTS, col_id)
        for col_id in range(
            metadata.query((
                res_id,
                metadata_module.ALL_ELEMENTS))['dimension']['length'])
        if 'd3mIndex' == metadata.query(
            (res_id, metadata_module.ALL_ELEMENTS, col_id))['name']
    ] for res_id in resources]
    primary_key = flatten(primary_key)
    if len(primary_key) != 1:
        raise Exception('Exactly one primary key is supported')
    cls._update_metadata_lookup(mdlu, 'primary_key', primary_key[0])
    cls._update_metadata_lookup(mdlu, 'primary_resource_id',
                                (primary_key[0][0], ))

    primary_resource_cols = metadata.query(
        (mdlu['primary_resource_id']['selector'][0],
         metadata_module.ALL_ELEMENTS))
    for col_id in range(primary_resource_cols['dimension']['length']):
        cmd = metadata.query((mdlu['primary_resource_id']['selector'][0],
                              metadata_module.ALL_ELEMENTS, col_id))
        if 'semantic_types' in cmd:
            st = cmd['semantic_types']
            if 'https://metadata.datadrivendiscovery.org/types/TrueTarget' in st:
                cls._update_metadata_lookup(
                    mdlu, 'targets',
                    (mdlu['primary_resource_id']['selector'][0],
                     metadata_module.ALL_ELEMENTS, col_id))

    return mdlu if cls._valid_metadata_lookup(mdlu) else None
def _can_use_column(cls, inputs_metadata: metadata_base.DataMetadata,
                    column_index: int) -> bool:
    column_metadata = inputs_metadata.query(
        (metadata_base.ALL_ELEMENTS, column_index))
    semantic_types = column_metadata.get('semantic_types', None)
    if semantic_types is None:
        return False
    return ('http://schema.org/Integer' in semantic_types
            or 'http://schema.org/Float' in semantic_types)
def _copy_columns_metadata(cls, inputs_metadata: metadata_base.DataMetadata,
                           column_indices, hyperparams) -> List[OrderedDict]:
    target_columns_metadata: List[OrderedDict] = []
    for column_index in column_indices:
        column_name = inputs_metadata.query(
            (metadata_base.ALL_ELEMENTS, column_index)).get("name")
        column_metadata = OrderedDict(
            inputs_metadata.query_column(column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))
        semantic_types_to_remove = set()
        add_semantic_types = set()
        add_semantic_types.add(hyperparams["return_semantic_type"])
        semantic_types = semantic_types - semantic_types_to_remove
        semantic_types = semantic_types.union(add_semantic_types)
        column_metadata['semantic_types'] = list(semantic_types)
        column_metadata["name"] = str(column_name)
        target_columns_metadata.append(column_metadata)

    return target_columns_metadata