def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        """Select target columns from ``data``.

        Returns a triple ``(targets, target_column_names, target_column_indices)``.
        When semantic-type filtering is disabled, every column is a target.
        Note ``targets`` is an empty list (not a DataFrame) when nothing matches.
        """
        # Without semantic-type filtering, the whole frame is the target.
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            # A column qualifies only if it carries the TrueTarget semantic type.
            required = {"https://metadata.datadrivendiscovery.org/types/TrueTarget"}
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if not semantic_types:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # All required types must be present on the column.
            return required.issubset(semantic_types)

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column,
        )
        targets = []
        if target_column_indices:
            targets = data.select_columns(target_column_indices)
        target_column_names = [data.columns[idx] for idx in target_column_indices]
        return targets, target_column_names, target_column_indices
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:
        """Pool per-image spatial feature maps into one feature vector per row.

        Selects the Float columns of ``inputs``, reinterprets them as
        ``(n, 2048, height, width)`` feature maps, applies spatial and
        channel weighting per batch (CroW-style cross-dimensional
        weighting — TODO confirm intent), and returns the pooled features
        as a DataFrame with Float-typed columns.
        """
        float_columns = inputs.metadata.list_columns_with_semantic_types(
            ("http://schema.org/Float",)
        )
        selected = inputs.select_columns(float_columns)
        # Assumes the Float columns flatten to 2048 * height * width values
        # per row — TODO confirm against the upstream featurizer.
        maps = selected.to_numpy().reshape(
            selected.shape[0], 2048, self.hyperparams["height"], self.hyperparams["width"]
        )

        batch_size = self.hyperparams["batch_size"]
        alpha = 2.0  # spatial aggregation exponent
        beta = 2.0   # spatial weighting exponent
        pooled_batches = []
        for batch_idx in range(math.ceil(maps.shape[0] / batch_size)):
            chunk = maps[batch_idx * batch_size : (batch_idx + 1) * batch_size]

            # Spatial weight: per-location channel sum, normalized by its alpha-norm.
            spatial_weight = chunk.sum(axis=1, keepdims=True)
            norm = (spatial_weight ** alpha).sum(axis=(2, 3), keepdims=True)
            norm = norm ** (1.0 / alpha)
            spatial_weight = (spatial_weight / norm) ** (1.0 / beta)

            _, n_channels, dim1, dim2 = chunk.shape
            # Channel weight: log of inverse per-channel non-zero fraction
            # (rarely-firing channels get boosted); epsilon avoids div-by-zero.
            nonzero_frac = (chunk != 0).astype(float).sum(axis=(2, 3)) / 1.0 / (
                dim1 * dim2
            ) + 1e-6
            channel_weight = np.log(nonzero_frac.sum(axis=1, keepdims=True) / nonzero_frac)

            weighted = chunk * spatial_weight
            weighted = weighted.sum(axis=(2, 3))
            weighted = weighted * channel_weight
            pooled_batches.append(weighted)

        stacked = np.vstack(pooled_batches)
        names = [f"feat_{j}" for j in range(stacked.shape[1])]
        feature_df = pd.DataFrame(stacked, columns=names)

        # Generate metadata from a single row (cheap), patch the row count,
        # then append the remaining rows without regenerating metadata.
        outputs = container.DataFrame(feature_df.head(1), generate_metadata=True)
        outputs.metadata = outputs.metadata.update(
            (metadata_base.ALL_ELEMENTS,),
            {"dimension": {"length": feature_df.shape[0]}},
        )
        outputs = outputs.append(feature_df.iloc[1:])
        for col in range(outputs.shape[1]):
            outputs.metadata = outputs.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, col), "http://schema.org/Float"
            )

        return base.CallResult(outputs)
# Example #3
# 0
def combine_columns(
    inputs: container.DataFrame,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[container.DataFrame],
    *,
    return_result: str,
    add_index_columns: bool,
) -> container.DataFrame:
    """
    Method which appends existing columns, replaces them, or creates new result from them, based on
    ``return_result`` argument, which can be ``append``, ``replace``, or ``new``.

    ``add_index_columns`` controls if when creating a new result, primary index columns should be added
    if they are not already among columns.

    ``inputs`` is a DataFrame for which we are appending or replacing columns, or if we are creating new result,
    from where a primary index column can be taken.

    ``column_indices`` controls which columns in ``inputs`` were used to create ``columns_list``,
    and which columns should be replaced when replacing them.

    ``columns_list`` is a list of DataFrames representing all together new columns. The reason it is a list is
    to make it easier to operate per-column when preparing ``columns_list`` and not have to concat them all
    together unnecessarily.

    Top-level metadata in ``columns_list`` is ignored, except when creating new result.
    In that case top-level metadata from the first element in the list is used.

    When ``column_indices`` columns are being replaced with ``columns_list``, existing metadata in ``column_indices``
    columns is not preserved but replaced with metadata in ``columns_list``. Ideally, metadata for ``columns_list``
    has been constructed by copying source metadata from ``column_indices`` columns and modifying it as
    necessary to adapt it to new columns. But ``columns_list`` also can have completely new metadata, if this
    is more reasonable, but it should be understood that in this case when replacing ``column_indices``
    columns, any custom additional metadata on those columns will be lost.

    ``column_indices`` and ``columns_list`` do not have to match in number of columns. Columns are first
    replaced in order for matching indices and columns. If then there are more ``column_indices`` than
    ``columns_list``, additional ``column_indices`` columns are removed. If there are more ``columns_list`` than
    ``column_indices`` columns, then additional ``columns_list`` are inserted after the last replaced column.

    If ``column_indices`` is empty, then the replacing behavior is equivalent to appending.

    :raises exceptions.InvalidArgumentValueError: if ``return_result`` is not one of the three known values.
    :raises ValueError: in ``new`` mode, if no DataFrame in ``columns_list`` has any columns.
    """

    if return_result == 'append':
        # Keep all input columns and add every produced column after them.
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            # Nothing to replace, so "replace" degenerates to "append".
            return combine_columns(inputs,
                                   column_indices,
                                   columns_list,
                                   return_result='append',
                                   add_index_columns=add_index_columns)

        # Compute the difference in "columns"
        # (the input column indices that were NOT consumed and must survive).
        to_be_added = list(
            numpy.setdiff1d(numpy.arange(len(inputs.columns)), column_indices))
        columns_replaced = 0
        if len(to_be_added) < len(column_indices):
            # Most input columns are being replaced:
            # More efficient to concatenate than replace one-by-one
            outputs = pandas.concat(columns_list, axis=1)
            outputs = container.DataFrame(data=outputs,
                                          generate_metadata=False)
            # Seed metadata with the correct number of columns (width of the
            # first produced frame); it is fully overwritten just below.
            indices = range(columns_list[0].shape[1])
            outputs.metadata = inputs.metadata.select_columns(
                columns=list(indices))

            c = 0
            for columns in columns_list:
                # NOTE(review): columns_length is unused in this branch.
                columns_length = columns.shape[1]
                if c == 0:
                    # First frame replaces the placeholder metadata in place.
                    outputs.metadata = outputs.metadata.replace_columns(
                        columns.metadata, list(indices))
                else:
                    # Subsequent frames extend the metadata to the right.
                    outputs.metadata = outputs.metadata.append_columns(
                        columns.metadata)
                c += 1

            # Re-insert the surviving input columns at (or near) their
            # original positions, clamped to the current width.
            for col in to_be_added:
                insert_index = col.item()
                if insert_index > outputs.shape[1]:
                    insert_index = outputs.shape[1]
                outputs = outputs.insert_columns(
                    inputs.select_columns([col.item()]), insert_index)
            outputs.metadata = outputs.metadata.compact(['structural_type'])
        else:
            # Few input columns are being replaced: patch them in-place.
            # We copy here and disable copying inside "replace_columns" to copy only once.
            # We have to copy because "replace_columns" is modifying data in-place.
            outputs = copy.copy(inputs)
            for columns in columns_list:
                columns_length = columns.shape[1]
                if columns_replaced < len(column_indices):
                    # It is OK if the slice of "column_indices" is shorter than "columns", Only those columns
                    # listed in the slice will be replaced and others appended after the last replaced column.
                    outputs = outputs.replace_columns(
                        columns,
                        column_indices[columns_replaced:columns_replaced +
                                       columns_length],
                        copy=False)
                else:
                    # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                    # is non-empty and that the last item of "column_indices" points to the last column we replaced
                    # for those listed in "column_indices". We replaced more columns though, so we have to add the
                    # difference, and then add 1 to insert after the last column.
                    outputs = outputs.insert_columns(
                        columns, column_indices[-1] +
                        (columns_replaced - len(column_indices)) + 1)
                columns_replaced += columns_length

            # More indices than produced columns: drop the leftovers.
            if columns_replaced < len(column_indices):
                outputs = outputs.remove_columns(
                    column_indices[columns_replaced:len(column_indices)])
    elif return_result == 'new':
        if not any(columns.shape[1] for columns in columns_list):
            raise ValueError("No columns produced.")

        # Build the result from produced columns only (top-level metadata
        # comes from the first frame, per the docstring contract).
        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.metadata.get_index_columns()
            outputs_index_columns = outputs.metadata.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(
                    inputs_index_columns).append_columns(
                        outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(
                return_result=return_result))

    return outputs