def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
    """Select the target columns of ``data``.

    When ``use_semantic_types`` is disabled, every column is treated as a
    target. Otherwise only columns whose metadata carries the TrueTarget
    semantic type are selected, further restricted by the
    ``use_outputs_columns`` / ``exclude_outputs_columns`` hyper-parameters.

    Returns a triple ``(targets, target_column_names, target_column_indices)``
    where ``targets`` is a DataFrame of the selected columns (an empty list
    when nothing was selected).
    """
    # Fast path: no semantic-type filtering — every column is a target.
    if not hyperparams['use_semantic_types']:
        every_index = list(range(len(data.columns)))
        return data, list(data.columns), every_index

    metadata = data.metadata
    required_types = {"https://metadata.datadrivendiscovery.org/types/TrueTarget"}

    def can_produce_column(column_index: int) -> bool:
        # A column qualifies only when it carries every required semantic type.
        column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
        semantic_types = set(column_metadata.get('semantic_types', []))
        if not semantic_types:
            cls.logger.warning("No semantic types found in column metadata")
            return False
        return required_types.issubset(semantic_types)

    target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
        metadata,
        use_columns=hyperparams['use_outputs_columns'],
        exclude_columns=hyperparams['exclude_outputs_columns'],
        can_use_column=can_produce_column,
    )

    # An empty selection yields an empty list rather than an empty DataFrame,
    # matching the original contract.
    targets = data.select_columns(target_column_indices) if target_column_indices else []
    target_column_names = [data.columns[idx] for idx in target_column_indices]
    return targets, target_column_names, target_column_indices
def produce(
    self,
    *,
    inputs: container.DataFrame,
    timeout: float = None,
    iterations: int = None,
) -> base.CallResult[container.DataFrame]:
    """Pool per-image CNN feature maps into one feature vector per image.

    Selects the float columns of ``inputs``, reinterprets them as feature
    maps of shape ``(n_images, 2048, height, width)``, and reduces each map
    to a 2048-dim vector using spatial and channel weighting (this looks
    like CroW-style cross-dimensional weighted pooling — confirm against the
    upstream reference). Returns the pooled features as a DataFrame whose
    columns are tagged with the Float semantic type.

    NOTE(review): ``timeout`` and ``iterations`` are accepted for the d3m
    primitive interface but are not used by this implementation.
    """
    # Keep only columns tagged as floats — presumably the flattened CNN
    # activations produced by an upstream featurizer (TODO confirm caller).
    df = inputs.select_columns(
        inputs.metadata.list_columns_with_semantic_types(
            ("http://schema.org/Float",)
        )
    )
    # Reshape flat rows into (n_images, channels=2048, height, width).
    # The 2048 channel count is hard-coded; rows must have exactly
    # 2048 * height * width float columns or reshape raises.
    df = df.to_numpy().reshape(
        df.shape[0], 2048, self.hyperparams["height"], self.hyperparams["width"]
    )
    all_img_features = []
    batch_size = self.hyperparams["batch_size"]
    # Exponents for the spatial weighting normalization below.
    spatial_a = 2.0
    spatial_b = 2.0
    # Process images in batches to bound peak memory.
    for i in range(math.ceil(df.shape[0] / batch_size)):
        features = df[i * batch_size : (i + 1) * batch_size]
        # Spatial weight: per-location activation mass summed over channels,
        # normalized by its L_{spatial_a} norm over the spatial grid.
        spatial_weight = features.sum(axis=1, keepdims=True)
        z = (spatial_weight ** spatial_a).sum(axis=(2, 3), keepdims=True)
        z = z ** (1.0 / spatial_a)
        spatial_weight = (spatial_weight / z) ** (1.0 / spatial_b)
        # NOTE(review): here w is the height axis and h the width axis of the
        # (batch, c, height, width) array; only their product is used, so the
        # swapped names are harmless.
        _, c, w, h = features.shape
        # Fraction of non-zero activations per channel; 1e-6 avoids a zero
        # denominator in the log ratio below.
        nonzeros = (features != 0).astype(float).sum(axis=(2, 3)) / 1.0 / (
            w * h
        ) + 1e-6
        # Channel weight: rarer (sparser) channels get boosted.
        channel_weight = np.log(nonzeros.sum(axis=1, keepdims=True) / nonzeros)
        # Apply spatial weights, pool over the spatial grid, then apply
        # channel weights — yielding one 2048-dim vector per image.
        features = features * spatial_weight
        features = features.sum(axis=(2, 3))
        features = features * channel_weight
        all_img_features.append(features)
    all_img_features = np.vstack(all_img_features)
    col_names = [f"feat_{i}" for i in range(0, all_img_features.shape[1])]
    feature_df = pd.DataFrame(all_img_features, columns=col_names)
    # Metadata-generation shortcut: generate full metadata for a single row
    # only (cheap), patch the row-dimension length to the real row count,
    # then append the remaining rows without regenerating metadata.
    outputs = container.DataFrame(feature_df.head(1), generate_metadata=True)
    outputs.metadata = outputs.metadata.update(
        (metadata_base.ALL_ELEMENTS,),
        {"dimension": {"length": feature_df.shape[0]}},
    )
    outputs = outputs.append(feature_df.iloc[1:])
    # Tag every output column as a float.
    for idx in range(outputs.shape[1]):
        outputs.metadata = outputs.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, idx), "http://schema.org/Float"
        )
    return base.CallResult(outputs)
def combine_columns(
    inputs: container.DataFrame,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[container.DataFrame],
    *,
    return_result: str,
    add_index_columns: bool,
) -> container.DataFrame:
    """
    Method which appends existing columns, replaces them, or creates new result from them, based on
    ``return_result`` argument, which can be ``append``, ``replace``, or ``new``.

    ``add_index_columns`` controls if when creating a new result, primary index columns should be added
    if they are not already among columns.

    ``inputs`` is a DataFrame for which we are appending or replacing columns, or if we are creating new
    result, from where a primary index column can be taken.

    ``column_indices`` controls which columns in ``inputs`` were used to create ``columns_list``,
    and which columns should be replaced when replacing them.

    ``columns_list`` is a list of DataFrames representing all together new columns. The reason it is a
    list is to make it easier to operate per-column when preparing ``columns_list`` and not have to
    concat them all together unnecessarily.

    Top-level metadata in ``columns_list`` is ignored, except when creating new result. In that case
    top-level metadata from the first element in the list is used.

    When ``column_indices`` columns are being replaced with ``columns_list``, existing metadata in
    ``column_indices`` columns is not preserved but replaced with metadata in ``columns_list``. Ideally,
    metadata for ``columns_list`` has been constructed by copying source metadata from ``column_indices``
    columns and modifying it as necessary to adapt it to new columns. But ``columns_list`` also can have
    completely new metadata, if this is more reasonable, but it should be understood that in this case
    when replacing ``column_indices`` columns, any custom additional metadata on those columns will be lost.

    ``column_indices`` and ``columns_list`` do not have to match in number of columns. Columns are first
    replaced in order for matching indices and columns. If then there are more ``column_indices`` than
    ``columns_list``, additional ``column_indices`` columns are removed. If there are more ``columns_list``
    than ``column_indices`` columns, then additional ``columns_list`` are inserted after the last replaced
    column.

    If ``column_indices`` is empty, then the replacing behavior is equivalent to appending.
    """
    if return_result == 'append':
        # Keep all input columns and append every produced DataFrame after them.
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            # Nothing to replace — replacing with an empty index list is
            # defined to behave exactly like appending.
            return combine_columns(inputs, column_indices, columns_list, return_result='append', add_index_columns=add_index_columns)

        # Input columns that are NOT being replaced and therefore must be
        # carried over into the result at their original positions.
        to_be_added = list(
            numpy.setdiff1d(numpy.arange(len(inputs.columns)), column_indices))
        columns_replaced = 0
        if len(to_be_added) < len(column_indices):
            # Most columns are being replaced: it is cheaper to concatenate
            # all new columns at once and then insert the few surviving
            # input columns back, than to replace one-by-one.
            outputs = pandas.concat(columns_list, axis=1)
            outputs = container.DataFrame(data=outputs, generate_metadata=False)
            # Seed the result's metadata from the first len(columns_list[0])
            # input columns, then overwrite it with the new columns' metadata.
            indices = range(columns_list[0].shape[1])
            outputs.metadata = inputs.metadata.select_columns(
                columns=list(indices))

            # First element replaces the seeded column metadata; subsequent
            # elements append their column metadata after it.
            c = 0
            for columns in columns_list:
                columns_length = columns.shape[1]
                if c == 0:
                    outputs.metadata = outputs.metadata.replace_columns(
                        columns.metadata, list(indices))
                else:
                    outputs.metadata = outputs.metadata.append_columns(
                        columns.metadata)
                c += 1

            # Re-insert the surviving input columns at their original
            # positions (clamped to the current width when the original
            # position is past the end).
            for col in to_be_added:
                insert_index = col.item()
                if insert_index > outputs.shape[1]:
                    insert_index = outputs.shape[1]
                outputs = outputs.insert_columns(
                    inputs.select_columns([col.item()]), insert_index)
            # Compact equivalent per-column structural types into ALL_ELEMENTS.
            outputs.metadata = outputs.metadata.compact(['structural_type'])
        else:
            # We copy here and disable copying inside "replace_columns" to copy only once.
            # We have to copy because "replace_columns" is modifying data in-place.
            outputs = copy.copy(inputs)
            for columns in columns_list:
                columns_length = columns.shape[1]

                if columns_replaced < len(column_indices):
                    # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                    # listed in the slice will be replaced and others appended after the last replaced column.
                    outputs = outputs.replace_columns(
                        columns, column_indices[columns_replaced:columns_replaced + columns_length], copy=False)
                else:
                    # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                    # is non-empty and that the last item of "column_indices" points to the last column we replaced
                    # for those listed in "column_indices". We replaced more columns though, so we have to add the
                    # difference, and then add 1 to insert after the last column.
                    outputs = outputs.insert_columns(
                        columns, column_indices[-1] + (columns_replaced - len(column_indices)) + 1)

                columns_replaced += columns_length

            # Fewer new columns than replaced indices: drop the leftover
            # input columns that were marked for replacement.
            if columns_replaced < len(column_indices):
                outputs = outputs.remove_columns(
                    column_indices[columns_replaced:len(column_indices)])

    elif return_result == 'new':
        # Result consists solely of the produced columns.
        if not any(columns.shape[1] for columns in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.metadata.get_index_columns()
            outputs_index_columns = outputs.metadata.get_index_columns()

            # Only borrow index columns from the input when the new result
            # does not already carry its own.
            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(
                    inputs_index_columns).append_columns(
                        outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(
                return_result=return_result))

    return outputs