Example 1
def get_primitive(primitive_path: str) -> typing.Type[base.PrimitiveBase]:
    """
    Loads (if not already) a primitive class and returns it.

    Parameters
    ----------
    primitive_path:
        A Python path under ``d3m.primitives`` namespace of a primitive.

    Returns
    -------
    A primitive class.
    """

    if not primitive_path:
        raise exceptions.InvalidArgumentValueError(
            "Primitive path is required.")

    if not primitive_path.startswith('d3m.primitives.'):
        raise exceptions.InvalidArgumentValueError(
            "Primitive path does not start with \"d3m.primitives\".")

    path, name = primitive_path.rsplit('.', 1)

    module = importlib.import_module(path)

    return getattr(module, name)
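The lookup above is a thin wrapper around importlib: split the dotted path, import the module part, and fetch the final attribute. A minimal standalone sketch of the same mechanism, using a standard-library path instead of a d3m primitive so it runs without d3m installed:

import importlib


def load_by_path(path: str):
    # Split "package.module.Attribute" into the module path and the attribute name,
    # import the module, and return the attribute.
    module_path, name = path.rsplit('.', 1)
    return getattr(importlib.import_module(module_path), name)


print(load_by_path('collections.OrderedDict'))  # <class 'collections.OrderedDict'>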
Example 2
    def _get_predictions(self, inputs: Inputs) -> pandas.DataFrame:
        """
        It requires that predictions already have the right structure (one ``d3mIndex``
        column, at most one ``confidence`` column, at most one ``rank`` column,
        no duplicate column names).

        We return a regular Pandas DataFrame with column names matching those in the metadata.
        We convert all columns to strings to match what would be loaded from ``predictions.csv`` file.
        Predictions DataFrame should already have float vectors encoded as strings.
        """

        dataframe = self._to_dataframe(inputs)

        if metrics.INDEX_COLUMN not in dataframe.columns:
            raise exceptions.InvalidArgumentValueError("No primary index column.")

        if d3m_utils.has_duplicates(dataframe.columns):
            duplicate_names = list(dataframe.columns)
            for name in set(dataframe.columns):
                duplicate_names.remove(name)
            raise exceptions.InvalidArgumentValueError(
                "Predicted target columns have duplicate names: {duplicate_names}".format(
                    duplicate_names=sorted(set(duplicate_names)),
                ),
            )

        return dataframe
Example 3
    def __init__(self, dimension_values: typing.Dict[DimensionName, typing.List[T]], *,
                 dimension_ordering: typing.List[DimensionName] = None, value_weights: typing.Dict[DimensionName, typing.List[float]] = None) -> None:

        if dimension_ordering is not None and set(dimension_values.keys()) != set(dimension_ordering):
            raise exceptions.InvalidArgumentValueError(
                'The keys of dimension_values and dimension_ordering must be the same')

        if value_weights is not None:
            if set(dimension_values.keys()) != set(value_weights.keys()):
                raise exceptions.InvalidArgumentValueError(
                    'The set of keys of dimension_values and value_weights must be the same')

            for key in dimension_values.keys():
                if len(dimension_values[key]) != len(value_weights[key]):
                    raise exceptions.InvalidArgumentValueError(
                        'The length of dimension_values[{}] and value_weights[{}] must be the same'.format(key, key))

        if value_weights is None:
            value_weights = {}
            for key in dimension_values.keys():
                value_weights[key] = [1.0] * len(dimension_values[key])

        if dimension_ordering is None:
            dimension_ordering = list(dimension_values.keys())

        self._dimension_values: typing.Dict[DimensionName, typing.List[T]] = dimension_values
        self._value_weights: typing.Dict[DimensionName, typing.List[float]] = value_weights
        self._dimension_ordering = dimension_ordering
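A standalone illustration of the defaulting above (the dictionary contents are made up): when value_weights is omitted every value gets a weight of 1.0, and dimension_ordering defaults to the key order of dimension_values.

dimension_values = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1.0, 10.0]}

value_weights = {key: [1.0] * len(values) for key, values in dimension_values.items()}
dimension_ordering = list(dimension_values.keys())

print(value_weights)       # {'kernel': [1.0, 1.0], 'C': [1.0, 1.0, 1.0]}
print(dimension_ordering)  # ['kernel', 'C']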
Example 4
    def insert_columns(self: D, columns: 'DataFrame',
                       at_column_index: int) -> D:
        """
        Inserts all columns from ``columns`` before ``at_column_index`` column in this DataFrame,
        pushing all existing columns to the right.

        E.g., ``at_column_index == 0`` means inserting ``columns`` at the beginning of this DataFrame.

        Top-level metadata of ``columns`` is ignored.
        """

        columns_length = self.shape[1]

        if at_column_index < 0:
            raise exceptions.InvalidArgumentValueError(
                "\"at_column_index\" is smaller than 0.")
        if at_column_index > columns_length:
            raise exceptions.InvalidArgumentValueError(
                "\"at_column_index\" is larger than the range of existing columns."
            )

        if at_column_index == 0:
            return columns.append_columns(self, use_right_metadata=True)

        if at_column_index == columns_length:
            return self.append_columns(columns)

        # TODO: This could probably be optimized without all the slicing and joining.

        before = self.select_columns(list(range(0, at_column_index)))
        after = self.select_columns(
            list(range(at_column_index, columns_length)))

        return before.append_columns(columns).append_columns(after)
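The slice-and-join strategy used for the general case can be sketched with plain pandas, leaving out the d3m metadata handling (assumes pandas is installed; the frames are made up):

import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
new = pd.DataFrame({'x': [5, 6]})

at_column_index = 1
before = df.iloc[:, :at_column_index]
after = df.iloc[:, at_column_index:]
result = pd.concat([before, new, after], axis=1)

print(list(result.columns))  # ['a', 'x', 'b']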
Example 5
    def __init__(self,
                 other: typing.Dict[str, typing.Any] = None,
                 **values: typing.Any) -> None:
        if other is None:
            other = {}

        values = dict(other, **values)

        params_keys = set(self.__params_items__.keys())  # type: ignore
        values_keys = set(values.keys())

        missing = params_keys - values_keys
        if len(missing):
            raise exceptions.InvalidArgumentValueError(
                "Not all parameters are specified: {missing}".format(
                    missing=missing))

        extra = values_keys - params_keys
        if len(extra):
            raise exceptions.InvalidArgumentValueError(
                "Additional parameters are specified: {extra}".format(
                    extra=extra))

        for name, value in values.items():
            value_type = self.__params_items__[name]  # type: ignore
            if not utils.is_instance(value, value_type):
                raise exceptions.InvalidArgumentTypeError(
                    "Value '{value}' for parameter '{name}' is not an instance of the type: {value_type}"
                    .format(value=value, name=name, value_type=value_type))

        super().__init__(values)
Example 6
    def produce(self,
                *,
                inputs: container.Dataset,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:

        main_resource_index = self.hyperparams['main_resource_index']
        if main_resource_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no main resource specified')

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata,
                                            main_resource_index, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) +
                    ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column contains csv file names')

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_index,
                                        file_index)
        output_data = []
        timeseries_dataframe = pd.DataFrame()
        for idx, tRow in inputs[main_resource_index].iterrows():
            # read the timeseries data
            csv_path = os.path.join(base_path, tRow[file_index])
            timeseries_row = pd.read_csv(csv_path)

            # add the timeseries id
            tRow = tRow.append(pd.Series({'series_id': int(idx)}))

            # combine the timeseries data with the value row
            output_data.extend([
                pd.concat([tRow, vRow])
                for vIdx, vRow in timeseries_row.iterrows()
            ])

        # add the timeseries index
        timeseries_dataframe = timeseries_dataframe.append(output_data,
                                                           ignore_index=True)

        # join the metadata from the 2 data resources
        timeseries_dataframe = container.DataFrame(timeseries_dataframe)

        # wrap as a D3M container
        #return base.CallResult(container.Dataset({'0': timeseries_dataframe}, metadata))
        return base.CallResult(
            container.Dataset({'0': timeseries_dataframe},
                              generate_metadata=True))
Example 7
    def produce(self,
                *,
                inputs: container.List,
                timeout: float = None,
                iterations: int = None) -> base.CallResult[container.Dataset]:
        # build the list of dataframes from the list of inputs
        dataframes = []
        metadata = None
        for input in inputs:
            if isinstance(input, container.DataFrame):
                dataframes.append(input)
                continue
            try:
                _, main_dr = d3m_base_utils.get_tabular_resource(input, None)
                dataframes.append(main_dr)
                metadata = input.metadata
            except ValueError as error:
                raise exceptions.InvalidArgumentValueError(
                    "Failure to find tabular resource in dataset") from error

        if self.hyperparams["column_overlap"] == "exact":
            columns_to_handle = dataframes[0].columns
            if np.sum(
                    np.array([
                        np.all(df.columns == columns_to_handle)
                        for df in dataframes
                    ])) != len(dataframes):
                raise exceptions.InvalidArgumentValueError(
                    "Dataframes don't have same columns, cannot exact concat")
            concated = pd.concat(dataframes, ignore_index=True)
        elif self.hyperparams["column_overlap"] == "union":
            concated = pd.concat(dataframes, ignore_index=True)
        elif self.hyperparams["column_overlap"] == "intersection":
            concated = pd.concat(dataframes, join="inner", ignore_index=True)

        if self.hyperparams["remove_duplicate_rows"]:
            concated.drop_duplicates(subset="d3mIndex",
                                     keep="first",
                                     inplace=True,
                                     ignore_index=True)

        if metadata is None:
            metadata = container.Dataset({
                "learningData": concated.head(1)
            },
                                         generate_metadata=True).metadata
        outputs = container.Dataset({"learningData": concated}, metadata)
        outputs.metadata = outputs.metadata.update(
            (metadata_base.ALL_ELEMENTS, ),
            {"dimension": {
                "length": concated.shape[0]
            }})

        return base.CallResult(outputs)
Example 8
    def multi_produce(self, *, inputs1: Inputs, inputs2: Inputs, produce_methods: typing.Sequence[str],
                      timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        results = []
        for method_name in produce_methods:
            if method_name != 'produce' and not method_name.startswith('produce_'):
                raise exceptions.InvalidArgumentValueError(
                    "Invalid produce method name '{method_name}'.".format(method_name=method_name))

            if not hasattr(self, method_name):
                raise exceptions.InvalidArgumentValueError(
                    "Unknown produce method name '{method_name}'.".format(method_name=method_name))

            try:
                expected_arguments = set(self.metadata.query()['primitive_code'].get(
                    'instance_methods', {})[method_name]['arguments'])
            except KeyError as error:
                raise exceptions.InvalidArgumentValueError(
                    "Unknown produce method name '{method_name}'.".format(method_name=method_name)) from error

            arguments = {'inputs1': inputs1,
                         'inputs2': inputs2,
                         }

            start = time.perf_counter()
            results.append(getattr(self, method_name)(
                timeout=timeout, **arguments))
            delta = time.perf_counter() - start

            # Decrease the amount of time available to other calls. This delegates responsibility
            # of raising a "TimeoutError" exception to produce methods themselves. It also assumes
            # that if one passes a negative timeout value to a produce method, it raises a
            # "TimeoutError" exception correctly.
            if timeout is not None:
                timeout -= delta

        # We return the maximum number of iterations done by any produce method we called.
        iterations_done = None
        for result in results:
            if result.iterations_done is not None:
                if iterations_done is None:
                    iterations_done = result.iterations_done
                else:
                    iterations_done = max(
                        iterations_done, result.iterations_done)

        return MultiCallResult(
            values={name: result.value for name,
                    result in zip(produce_methods, results)},
            has_finished=all(result.has_finished for result in results),
            iterations_done=iterations_done,
        )
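The timeout bookkeeping above is a general pattern: each call only gets the time left over from the previous calls, and raising TimeoutError is left to the callee. A generic sketch with illustrative names (not part of the d3m API):

import time


def call_all(methods, timeout=None):
    results = []
    for method in methods:
        start = time.perf_counter()
        results.append(method(timeout=timeout))
        if timeout is not None:
            # May go negative; the callee is expected to raise TimeoutError in that case.
            timeout -= time.perf_counter() - start
    return results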
Example 9
def set_target_column(dataset):
    """
    Function used for unit test
    """

    # TODO: Cannot assume resource_id '0' exists
    resource_id = '0'
    for index in range(
            dataset.metadata.query(
                (resource_id, ALL_ELEMENTS))['dimension']['length'] - 1, -1,
            -1):
        column_semantic_types = dataset.metadata.query(
            (resource_id, ALL_ELEMENTS, index))['semantic_types']
        if 'https://metadata.datadrivendiscovery.org/types/SuggestedTarget' in \
                column_semantic_types:
            column_semantic_types = list(column_semantic_types) + [
                'https://metadata.datadrivendiscovery.org/types/Target',
                'https://metadata.datadrivendiscovery.org/types/TrueTarget'
            ]
            dataset.metadata = dataset.metadata.update(
                (resource_id, ALL_ELEMENTS, index),
                {'semantic_types': column_semantic_types})
            return

    raise exceptions.InvalidArgumentValueError(
        'At least one column should have semantic type SuggestedTarget')
Example 10
    def select_columns(self: D,
                       columns: typing.Sequence[
                           metadata_base.SimpleSelectorSegment],
                       *,
                       allow_empty_columns: bool = False) -> D:
        """
        Returns a new DataFrame with data and metadata only for given ``columns``.
        Moreover, columns are renumbered based on the position in ``columns`` list.
        Top-level metadata stays unchanged, except for updating the length of the columns dimension to
        the number of columns.

        So if the ``columns`` is ``[3, 6, 5]`` then output DataFrame will have three columns, ``[0, 1, 2]``,
        mapping data and metadata for columns ``3`` to ``0``, ``6`` to ``1`` and ``5`` to ``2``.

        This allows also duplication of columns.
        """

        if not columns and not allow_empty_columns:
            raise exceptions.InvalidArgumentValueError("No columns selected.")

        output = self.iloc[:, list(columns)]

        # We want to make sure it is a true copy.
        if output._is_view:
            output = output.copy()
        else:
            output._set_is_copy(copy=False)

        output.metadata = self.metadata.select_columns(
            columns, allow_empty_columns=allow_empty_columns)

        return output
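The positional selection underneath select_columns can be shown with plain pandas: columns are renumbered by their position in the selection list, and the same column may be selected more than once (assumes pandas; the frame is made up).

import pandas as pd

df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3], 'd': [4]})
selected = df.iloc[:, [3, 2, 2]]

print(list(selected.columns))  # ['d', 'c', 'c']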
Example 11
def crawl_openml_handler(
    arguments: argparse.Namespace,
    *,
    pipeline_resolver: typing.Callable = None,
    dataset_resolver: typing.Callable = None,
    problem_resolver: typing.Callable = None,
) -> None:
    if pipeline_resolver is None:
        pipeline_resolver = pipeline_module.get_pipeline
    if dataset_resolver is None:
        dataset_resolver = dataset_module.get_dataset
    if problem_resolver is None:
        problem_resolver = problem_module.get_problem

    context = metadata_base.Context[arguments.context]
    compute_digest = dataset_module.ComputeDigest[getattr(
        arguments, 'compute_digest',
        dataset_module.ComputeDigest.ONLY_IF_MISSING.name)]
    runtime_environment = pipeline_run_module.RuntimeEnvironment(
        worker_id=getattr(arguments, 'worker_id', None), )

    task_types = [
        problem_module.OpenMLTaskType[task_type]
        for task_type in arguments.task_types
    ]
    if utils.has_duplicates(task_types):
        raise exceptions.InvalidArgumentValueError(
            "Same task type listed multiple times.")

    assert task_types

    inputs_config = runtime._get_inputs_config_from_arguments(
        arguments=arguments,
        pipeline_resolver=pipeline_resolver,
        dataset_resolver=dataset_resolver,
    )

    assert inputs_config.data_pipeline

    has_errored = crawl_openml(
        save_dir=arguments.save_dir,
        task_types=task_types,
        data_pipeline=inputs_config.data_pipeline,
        data_params=inputs_config.data_params,
        context=context,
        random_seed=inputs_config.data_random_seed,
        volumes_dir=getattr(arguments, 'volumes_dir', None),
        scratch_dir=getattr(arguments, 'scratch_dir', None),
        runtime_environment=runtime_environment,
        max_tasks=arguments.max_tasks,
        ignore_tasks=arguments.ignore_tasks or [],
        ignore_datasets=arguments.ignore_datasets or [],
        dataset_resolver=dataset_resolver,
        problem_resolver=problem_resolver,
        compute_digest=compute_digest,
        strict_digest=getattr(arguments, 'strict_digest', False),
    )

    if has_errored:
        sys.exit(1)
Example 12
    def produce(self, *,
                left: Inputs,  # type: ignore
                right: Inputs,  # type: ignore
                timeout: float = None,
                iterations: int = None) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = utils.get_tabular_resource(left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError("Failure to find tabular resource in left dataset") from error

        try:
            right_resource_id, right_df = utils.get_tabular_resource(right, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError("Failure to find tabular resource in right dataset") from error

        accuracy = self.hyperparams['accuracy']
        if accuracy <= 0.0 or accuracy > 1.0:
            raise exceptions.InvalidArgumentValueError('accuracy of ' + str(accuracy) + ' is out of range')

        left_col = self.hyperparams['left_col']
        right_col = self.hyperparams['right_col']

        # perform join based on semantic type
        join_type = self._get_join_semantic_type(left, left_resource_id, left_col, right, right_resource_id, right_col)
        joined: pd.DataFrame = None
        if join_type in self._STRING_JOIN_TYPES:
            joined = self._join_string_col(left_df, left_col, right_df, right_col, accuracy)
        elif join_type in self._NUMERIC_JOIN_TYPES:
            joined = self._join_numeric_col(left_df, left_col, right_df, right_col, accuracy)
        elif join_type in self._DATETIME_JOIN_TYPES:
            joined = self._join_datetime_col(left_df, left_col, right_df, right_col, accuracy)
        else:
            raise exceptions.InvalidArgumentValueError('join not supported on type ' + str(join_type))

        # create a new dataset to hold the joined data
        resource_map = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource
        result_dataset = container.Dataset(resource_map)

        return base.CallResult(result_dataset)
Example 13
    def _get_value_indices(self, inputs_metadata):
        value_indices = self.hyperparams["value_cols"]
        if value_indices and len(value_indices) > 0:
            return value_indices
        value_indices = inputs_metadata.list_columns_with_semantic_types(
            self._target_semantic)
        if len(value_indices) > 0:
            return value_indices
        raise exceptions.InvalidArgumentValueError("no columns with target")
Example 14
    def __init__(self, other: typing.Dict[str, typing.Any] = None, **values: typing.Any) -> None:
        if other is None:
            other = {}

        values = dict(other, **values)

        params_keys = set(self.__params_items__.keys())  # type: ignore
        values_keys = set(values.keys())

        missing = params_keys - values_keys
        if len(missing):
            raise exceptions.InvalidArgumentValueError("Not all parameters are specified: {missing}".format(missing=missing))

        extra = values_keys - params_keys
        if len(extra):
            raise exceptions.InvalidArgumentValueError("Additional parameters are specified: {extra}".format(extra=extra))

        super().__init__(values)
Example 15
    def _get_time_index(self, inputs_metadata):
        time_index = self.hyperparams["time_col"]
        if time_index:
            return time_index
        time_indices = inputs_metadata.list_columns_with_semantic_types(
            self._time_semantic)
        if len(time_indices) > 0:
            return time_indices[0]
        raise exceptions.InvalidArgumentValueError("no column with time")
Example 16
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        file_index = self.hyperparams['file_col_index']

        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) + ' from ' +
                    str(inputs.columns) + ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column from ' + str(inputs.columns) +
                    ' contains csv file names')

        value_index = self.hyperparams['value_col_index']
        time_index = self.hyperparams['time_col_index']

        # load each time series file, transpose, and append
        base_path = inputs.metadata.query(
            (metadata_base.ALL_ELEMENTS, file_index))['location_base_uris'][0]
        timeseries_dataframe: pd.DataFrame
        for idx, file_path in enumerate(inputs.iloc[:, file_index]):
            csv_path = os.path.join(base_path, file_path)
            timeseries_row = pd.read_csv(csv_path).transpose()
            # use the time values as the column headers
            if idx == 0:
                timeseries_dataframe = pd.DataFrame(
                    columns=timeseries_row.iloc[time_index])

            timeseries_dataframe = timeseries_dataframe.append(
                timeseries_row.iloc[value_index])

        # get the index to use a range of ints rather than the value col name
        timeseries_dataframe = timeseries_dataframe.reset_index(drop=True)

        # wrap as a D3M container - metadata should be auto generated
        return base.CallResult(container.DataFrame(data=timeseries_dataframe))
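The transpose step above can be sketched with plain pandas: each loaded file keeps only its value row, with the time values promoted to column headers (the column names below are made up, and set_index is used as a shorter equivalent of the explicit transpose/iloc handling):

import pandas as pd

series = pd.DataFrame({'time': [0, 1, 2], 'value': [1.0, 2.0, 3.0]})

wide = series.set_index('time').transpose()
print(wide)
# time     0    1    2
# value  1.0  2.0  3.0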
Example 17
    def _get_grouping_key_index(self, inputs_metadata):
        group_key_index = self.hyperparams["grouping_key_col"]
        if group_key_index:
            return group_key_index
        grouping_key_indices = inputs_metadata.list_columns_with_semantic_types(
            self._grouping_key_semantic)
        if len(grouping_key_indices) > 0:
            return grouping_key_indices[0]
        raise exceptions.InvalidArgumentValueError(
            "no column with grouping key")
Example 18
    def log_likelihoods(self,
                        *,
                        outputs: Outputs,
                        inputs: Inputs,
                        timeout: float = None,
                        iterations: int = None) -> CallResult[Sequence[float]]:
        inputs = inputs.iloc[:, self._training_indices]  # Get ndarray
        outputs = outputs.iloc[:, self._target_column_indices]

        if len(inputs.columns) and len(outputs.columns):

            if outputs.shape[1] != self._n_classes:
                raise exceptions.InvalidArgumentValueError(
                    "\"outputs\" argument does not have the correct number of target columns."
                )

            log_proba = self._predict_log_proba(inputs, self._weights)

            # Making it always a list, even when only one target.
            if self._n_classes == 1:
                log_proba = [log_proba]
                classes = [self._classes_]
            else:
                classes = self._classes_

            samples_length = inputs.shape[0]

            log_likelihoods = []
            for k in range(self._n_classes):
                # We have to map each class to its internal (numerical) index used in the learner.
                # This allows "outputs" to contain string classes.
                outputs_column = outputs.iloc[:, k]
                classes_map = pandas.Series(np.arange(len(classes[k])),
                                            index=classes[k])
                mapped_outputs_column = outputs_column.map(classes_map)

                # For each target column (column in "outputs"), for each sample (row) we pick the log
                # likelihood for a given class.
                log_likelihoods.append(log_proba[k][np.arange(samples_length),
                                                    mapped_outputs_column])

            results = d3m_dataframe(dict(enumerate(log_likelihoods)),
                                    generate_metadata=True)
            results.columns = outputs.columns

            for k in range(self._n_classes):
                column_metadata = outputs.metadata.query_column(k)
                if 'name' in column_metadata:
                    results.metadata = results.metadata.update_column(
                        k, {'name': column_metadata['name']})

        else:
            results = d3m_dataframe(generate_metadata=True)

        return CallResult(results)
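The per-sample lookup above relies on NumPy integer (fancy) indexing: pairing an array of row positions with an array of column positions picks one element per sample. A minimal sketch with made-up values:

import numpy as np

log_proba = np.log(np.array([
    [0.7, 0.3],
    [0.2, 0.8],
    [0.5, 0.5],
]))
mapped_outputs = np.array([0, 1, 1])  # internal class index of the true label per sample

picked = log_proba[np.arange(len(mapped_outputs)), mapped_outputs]
print(picked)  # [log(0.7), log(0.8), log(0.5)]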
Example 19
    def produce(
            self,
            *,
            inputs: container.Dataset,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        main_resource_index = self.hyperparams['main_resource_index']
        if main_resource_index is None:
            raise exceptions.InvalidArgumentValueError(
                'no main resource specified')

        file_index = self.hyperparams['file_col_index']
        if file_index is not None:
            if not self._is_csv_file_column(inputs.metadata,
                                            main_resource_index, file_index):
                raise exceptions.InvalidArgumentValueError(
                    'column idx=' + str(file_index) +
                    ' does not contain csv file names')
        else:
            file_index = self._find_csv_file_column(inputs.metadata)
            if file_index is None:
                raise exceptions.InvalidArgumentValueError(
                    'no column contains csv file names')

        # generate the long form timeseries data
        base_path = self._get_base_path(inputs.metadata, main_resource_index,
                                        file_index)

        csv_paths = [
            os.path.join(base_path, f)
            for f in inputs[main_resource_index].iloc[:, file_index]
        ]
        ts_values = [pd.read_csv(path) for path in csv_paths]
        for ts, val in zip(ts_values, inputs[main_resource_index].values):
            ts[list(inputs[main_resource_index])] = pd.DataFrame(
                [list(val)], index=ts.index)
        timeseries_dataframe = pd.concat(ts_values)
        timeseries_dataframe = container.DataFrame(timeseries_dataframe)
        return base.CallResult(
            container.Dataset({'0': timeseries_dataframe},
                              generate_metadata=True))
Example 20
    def fit_multi_produce(self,
                          *,
                          produce_methods: Sequence[str],
                          inputs: Inputs,
                          timeout: float = None,
                          iterations: int = None) -> MultiCallResult:
        self.set_training_data(inputs=inputs)  # type: ignore

        method_name = produce_methods[0]
        if method_name != 'produce':
            raise exceptions.InvalidArgumentValueError(
                "Invalid produce method name '{method_name}'.".format(
                    method_name=method_name))

        result = self.fit(timeout=timeout, iterations=iterations)

        return MultiCallResult(values={method_name: result.value})
Example 21
    def _granularityToRule(self):
        granularity = self.hyperparams["granularity"]
        if granularity == "seconds":
            return "S"
        elif granularity == "minutes":
            return "T"
        elif granularity == "hours":
            return "H"
        elif granularity == "days":
            return "D"
        elif granularity == "weeks":
            return "W"
        elif granularity == "months":
            return "M"
        elif granularity == "years":
            return "A"
        raise exceptions.InvalidArgumentValueError(
            "Given granularity argument not supported")
Example 22
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> base.CallResult[Outputs]:
        primitive_1 = self.hyperparams['primitive_1']
        primitive_2 = self.hyperparams['primitive_2']

        results = []

        if primitive_1 is not None:
            start = time.perf_counter()
            results.append(primitive_1.produce(inputs=inputs, timeout=timeout, iterations=iterations))
            delta = time.perf_counter() - start

            # Decrease the amount of time available to other calls. This delegates responsibility
            # of raising a "TimeoutError" exception to produce methods themselves. It also assumes
            # that if one passes a negative timeout value to a produce method, it raises a
            # "TimeoutError" exception correctly.
            if timeout is not None:
                timeout -= delta

        if primitive_2 is not None:
            results.append(primitive_2.produce(inputs=inputs, timeout=timeout, iterations=iterations))

        if not results:
            raise exceptions.InvalidArgumentValueError("No primitives provided as hyper-parameters.")

        # Even if the structure of outputs is the same as inputs, conceptually, outputs are different,
        # they are new data. So we do not reuse metadata from inputs but generate new metadata.
        outputs = container.List([sum(x) for x in zip(*[result.value for result in results])], generate_metadata=True)

        # We return the maximum number of iterations done by any produce method we called.
        iterations_done = None
        for result in results:
            if result.iterations_done is not None:
                if iterations_done is None:
                    iterations_done = result.iterations_done
                else:
                    iterations_done = max(iterations_done, result.iterations_done)

        return base.CallResult(
            value=outputs,
            has_finished=all(result.has_finished for result in results),
            iterations_done=iterations_done,
        )
Example 23
    def bind_primitive_IO(self, primitive: PrimitiveStep, templateIO):
        # print(templateIO)
        if len(templateIO) > 0:
            primitive.add_argument(
                name="inputs",
                argument_type=metadata_base.ArgumentType.CONTAINER,
                data_reference=templateIO[0])

        if len(templateIO) > 1:
            arguments = primitive.primitive.metadata.query()['primitive_code'][
                'instance_methods']['set_training_data']['arguments']
            if "outputs" in arguments:
                # Some primitives (e.g. GreedyImputer) require "outputs", while others do
                # not (e.g. MeanImputer)
                primitive.add_argument("outputs",
                                       metadata_base.ArgumentType.CONTAINER,
                                       templateIO[1])
        if len(templateIO) > 2:
            raise exceptions.InvalidArgumentValueError(
                "Should be less than 3 arguments!")
Example 24
def get_primitive_by_id(primitive_id: str) -> typing.Type[base.PrimitiveBase]:
    """
    Returns a primitive class based on its ID from all currently loaded primitives.

    Parameters
    ----------
    primitive_id:
        An ID of a primitive.

    Returns
    -------
    A primitive class.
    """

    for primitive in get_loaded_primitives():
        if primitive.metadata.query()['id'] == primitive_id:
            return primitive

    raise exceptions.InvalidArgumentValueError(
        "Unable to get primitive '{primitive_id}'.".format(
            primitive_id=primitive_id))
Example 25
    def _produce(
        self,
        *,
        left_df_full: container.DataFrame, # type: ignore
        left_df: container.DataFrame,  # type: ignore
        right_df: container.DataFrame,  # type: ignore
        join_types: typing.Sequence[str],
        left_col: typing.Sequence[int],
        right_col: typing.Sequence[int],
        accuracy: typing.Sequence[float],
        absolute_accuracy: typing.Sequence[bool]
    ) -> pd.DataFrame:

        # cycle through the columns to join the dataframes
        right_cols_to_drop = []
        new_left_cols = []
        new_right_cols = []
        for col_index in range(len(left_col)):
            # depending on the joining type, make a new dataframe that has columns we will want to merge on
            # keep track of which columns we will want to drop later on
            if len(self._STRING_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df = self._create_string_merge_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                )
                left_df[new_left_df.columns] = new_left_df
                right_name = "righty_string" + str(col_index)
                right_df.rename(
                    columns={right_col[col_index]: right_name}, inplace=True
                )
                new_left_cols += list(new_left_df.columns)
                new_right_cols.append(right_name)
            elif len(self._NUMERIC_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df = self._create_numeric_merge_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
                left_df[new_left_df.columns] = new_left_df
                right_name = "righty_numeric" + str(col_index)
                right_df.rename(
                    columns={right_col[col_index]: right_name}, inplace=True
                )
                new_left_cols += list(new_left_df.columns)
                new_right_cols.append(right_name)
            elif len(self._GEO_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df, new_right_df = self._create_geo_vector_merging_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
                left_df[new_left_df.columns] = new_left_df
                right_df[new_right_df.columns] = new_right_df
                new_left_cols += list(new_left_df.columns)
                new_right_cols += list(new_right_df.columns)
                right_cols_to_drop.append(right_col[col_index])
            elif len(self._VECTOR_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                new_left_df, new_right_df = self._create_vector_merging_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
                left_df[new_left_df.columns] = new_left_df
                right_df[new_right_df.columns] = new_right_df
                new_left_cols += list(new_left_df.columns)
                new_right_cols += list(new_right_df.columns)
                right_cols_to_drop.append(right_col[col_index])
            elif len(self._DATETIME_JOIN_TYPES.intersection(join_types[col_index])) > 0:
                tolerance = self._compute_datetime_tolerance(left_df_full, left_col[col_index], right_df, right_col[col_index], accuracy[col_index])
                new_left_df, new_right_df = self._create_datetime_merge_cols(
                    left_df,
                    left_col[col_index],
                    right_df,
                    right_col[col_index],
                    tolerance,
                    col_index,
                )
                left_df[new_left_df.columns] = new_left_df
                right_df[new_right_df.columns] = new_right_df
                new_left_cols += list(new_left_df.columns)
                new_right_cols += list(new_right_df.columns)
                right_cols_to_drop.append(right_col[col_index])
            else:
                raise exceptions.InvalidArgumentValueError(
                    "join not surpported on type " + str(join_types[col_index])
                )

        if "d3mIndex" in right_df.columns:
            right_cols_to_drop.append("d3mIndex")
        right_df.drop(columns=right_cols_to_drop, inplace=True)

        joined = pd.merge(
            left_df,
            right_df,
            how=self.hyperparams["join_type"],
            left_on=new_left_cols,
            right_on=new_right_cols,
            suffixes=["_left", "_right"],
        )

        # don't want to keep columns that were created specifically for merging
        # also, inner merge keeps the right column we merge on, we want to remove it
        joined.drop(columns=new_left_cols + new_right_cols, inplace=True)

        return joined
Example 26
    def produce(
        self,
        *,
        left: Inputs,  # type: ignore
        right: Inputs,  # type: ignore
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[Outputs]:

        # attempt to extract the main table
        try:
            left_resource_id, left_df = d3m_base_utils.get_tabular_resource(left, None)
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in left dataset"
            ) from error

        try:
            right_resource_id, right_df = d3m_base_utils.get_tabular_resource(
                right, None
            )
        except ValueError as error:
            raise exceptions.InvalidArgumentValueError(
                "Failure to find tabular resource in right dataset"
            ) from error

        accuracy = self.hyperparams["accuracy"]
        absolute_accuracy = self.hyperparams["absolute_accuracy"]

        # hyperparams may be parsed as tuples
        # floats could be integers if round number is passed in
        if isinstance(accuracy, collections.abc.Iterable):
            accuracy = [float(a) for a in accuracy]
        else:
            accuracy = float(accuracy)
        if isinstance(absolute_accuracy, collections.abc.Iterable):
            absolute_accuracy = list(absolute_accuracy)

        if type(accuracy) == float and not type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only 1 value of accuracy provided, but multiple values for absolute accuracy provided"
            )
        if (not type(accuracy) == float) and type(absolute_accuracy) == bool:
            raise exceptions.InvalidArgumentValueError(
                "only 1 for absolute accuracy provided, but multiple values of accuracy provided"
            )
        if type(accuracy) == float and not absolute_accuracy:
            if accuracy <= 0.0 or accuracy > 1.0:
                raise exceptions.InvalidArgumentValueError(
                    "accuracy of " + str(accuracy) + " is out of range"
                )
        elif type(accuracy) == list and type(absolute_accuracy) == list:
            if not len(accuracy) == len(absolute_accuracy):
                raise exceptions.InvalidArgumentValueError(
                    "the count of accuracy hyperparams does not match the count of absolute_accuracy hyperparams"
                )
            for i in range(len(accuracy)):
                if (accuracy[i] <= 0.0 or accuracy[i] > 1.0) and not absolute_accuracy[i]:
                    raise exceptions.InvalidArgumentValueError(
                        "accuracy of " + str(acc) + " is out of range"
                    )

        left_col = self.hyperparams["left_col"]
        right_col = self.hyperparams["right_col"]

        if type(left_col) != type(right_col) or (
            type(left_col) == list
            and len(left_col) != len(right_col)
            and type(accuracy) != list
            and len(accuracy) != len(left_col)
        ):
            raise exceptions.InvalidArgumentTypeError(
                "both left_col and right_col need to have same data type and if they are lists, the same list lengths"
            )
        if type(left_col) == str:
            left_col = [left_col]
            right_col = [right_col]
            accuracy = [accuracy]
            absolute_accuracy = [absolute_accuracy]

        join_types = [
            self._get_join_semantic_type(
                left,
                left_resource_id,
                left_col[i],
                right,
                right_resource_id,
                right_col[i],
            )
            for i in range(len(left_col))
        ]

        num_splits = 32
        joined_split = [None for i in range(num_splits)]
        left_df_split = np.array_split(left_df, num_splits)
        jobs = [delayed(self._produce_threaded)(
            index = i,
            left_df_full = left_df,
            left_dfs = left_df_split,
            right_df = right_df,
            join_types = join_types,
            left_col = left_col,
            right_col = right_col,
            accuracy = accuracy,
            absolute_accuracy = absolute_accuracy
        ) for i in range(num_splits)]
        joined_data = Parallel(n_jobs=self.hyperparams["n_jobs"], backend="loky", verbose=10)(jobs)

        # joined data needs to maintain order to mimic non-split joining
        for i, d in joined_data:
            joined_split[i] = d
        joined = pd.concat(joined_split, ignore_index = True)

        # create a new dataset to hold the joined data
        resource_map = {}
        float_vector_columns = {}
        for resource_id, resource in left.items():  # type: ignore
            if resource_id == left_resource_id:
                for column in joined.columns:
                    # need to avoid bug in container.Dataset, it doesn't like vector columns
                    if type(joined[column].iloc[0]) == np.ndarray:
                        float_vector_columns[column] = joined[column]
                        joined[column] = np.NAN
                resource_map[resource_id] = joined
            else:
                resource_map[resource_id] = resource

        # Generate metadata for the dataset using only the first row of the resource for speed -
        # metadata generation runs over each cell in the dataframe, but we only care about column
        # level generation.  Once that's done, set the actual dataframe value.
        result_dataset = container.Dataset(
            {k: v.head(1) for k, v in resource_map.items()}, generate_metadata=True
        )
        for k, v in resource_map.items():
            result_dataset[k] = v
            result_dataset.metadata = result_dataset.metadata.update(
                (k,), {"dimension": {"length": v.shape[0]}}
            )

        for key in float_vector_columns.keys():
            df = result_dataset[left_resource_id]
            df[key] = float_vector_columns[key]
            float_vec_loc = df.columns.get_loc(key)
            float_vec_col_indices = df.metadata.list_columns_with_semantic_types(
                ("https://metadata.datadrivendiscovery.org/types/FloatVector",)
            )
            if float_vec_loc not in float_vec_col_indices:
                df.metadata = df.metadata.add_semantic_type(
                    (metadata_base.ALL_ELEMENTS, float_vec_loc),
                    "https://metadata.datadrivendiscovery.org/types/FloatVector",
                )

        return base.CallResult(result_dataset)
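The split/parallelize/reassemble pattern above can be sketched on its own (assumes joblib, numpy, and pandas are installed; process_chunk is a stand-in for the real per-chunk join):

import numpy as np
import pandas as pd
from joblib import Parallel, delayed


def process_chunk(index, chunk):
    # Return the index alongside the result so the original order can be restored.
    return index, chunk * 2


df = pd.DataFrame({'value': range(10)})
num_splits = 4
chunks = np.array_split(df, num_splits)

results = Parallel(n_jobs=2)(
    delayed(process_chunk)(i, chunk) for i, chunk in enumerate(chunks)
)

ordered = [None] * num_splits
for i, part in results:
    ordered[i] = part
print(pd.concat(ordered, ignore_index=True))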
Example 27
    def _get_truth(self, score_dataset: container.Dataset) -> typing.Tuple[pandas.DataFrame, typing.Dict[str, typing.Any]]:
        """
        Extracts true targets from the Dataset's entry point, or the only tabular resource.
        It requires that there is only one primary index column, which it makes the first
        column, named ``d3mIndex``. Then true target columns follow.

        We return a regular Pandas DataFrame with column names matching those in the metadata,
        and a dict mapping target columns to all label values in those columns, if available in metadata.
        We convert all columns to strings to match what would be loaded from ``predictions.csv`` file.
        It encodes any float vectors as strings.
        """

        main_resource_id, main_resource = base_utils.get_tabular_resource(score_dataset, None, has_hyperparameter=False)

        # We first copy before modifying in-place.
        main_resource = container.DataFrame(main_resource, copy=True)
        main_resource = self._encode_columns(main_resource)

        dataframe = self._to_dataframe(main_resource)

        indices = list(score_dataset.metadata.get_index_columns(at=(main_resource_id,)))
        targets = list(score_dataset.metadata.list_columns_with_semantic_types(
            ['https://metadata.datadrivendiscovery.org/types/TrueTarget'],
            at=(main_resource_id,),
        ))

        if not indices:
            raise exceptions.InvalidArgumentValueError("No primary index column.")
        elif len(indices) > 1:
            raise exceptions.InvalidArgumentValueError("More than one primary index column.")
        if not targets:
            raise ValueError("No true target columns.")

        dataframe = dataframe.iloc[:, indices + targets]

        dataframe = dataframe.rename(columns={dataframe.columns[0]: metrics.INDEX_COLUMN})

        if metrics.SCORE_COLUMN in dataframe.columns[1:]:
            raise ValueError("True target column cannot be named \"confidence\". It is a reserved name.")
        if metrics.RANK_COLUMN in dataframe.columns[1:]:
            raise ValueError("True target column cannot be named \"rank\". It is a reserved name.")
        if metrics.INDEX_COLUMN in dataframe.columns[1:]:
            raise ValueError("True target column cannot be named \"d3mIndex\". It is a reserved name.")

        if d3m_utils.has_duplicates(dataframe.columns):
            duplicate_names = list(dataframe.columns)
            for name in set(dataframe.columns):
                duplicate_names.remove(name)
            raise exceptions.InvalidArgumentValueError(
                "True target columns have duplicate names: {duplicate_names}".format(
                    duplicate_names=sorted(set(duplicate_names)),
                ),
            )

        all_labels = {}

        for target_column_name, main_resource_column_index in zip(dataframe.columns[1:], targets):
            try:
                column_labels = score_dataset.metadata.query_column_field(main_resource_column_index, 'all_distinct_values', at=(main_resource_id,))
            except KeyError:
                continue

            all_labels[target_column_name] = [str(label) for label in column_labels]

        return dataframe, all_labels
Example 28
    def produce(  # type: ignore
        self, *, inputs: Inputs, score_dataset: container.Dataset, timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[Outputs]:
        if not self.hyperparams['metrics']:
            raise ValueError("\"metrics\" hyper-parameter cannot be empty.")

        truth, all_labels = self._get_truth(score_dataset)
        predictions = self._get_predictions(inputs)

        for target_column in self.hyperparams['all_labels']:
            all_labels[target_column['column_name']] = list(target_column['labels'])

        outputs: typing.Dict[str, typing.List] = {
            'metric': [],
            'value': [],
        }

        if self.hyperparams['add_normalized_scores']:
            outputs['normalized'] = []

        for metric_configuration in self.hyperparams['metrics']:
            metric = problem.PerformanceMetric[metric_configuration['metric']]
            metric_class = metric.get_class()

            params = {}

            if 'all_labels' in inspect.signature(metric_class).parameters and all_labels:
                params['all_labels'] = all_labels

            for param_name, param_value in metric_configuration.items():
                if param_name == 'metric':
                    continue
                if param_value is None:
                    continue
                params[param_name] = param_value

            if metric.requires_score() and metrics.SCORE_COLUMN not in predictions.columns:
                raise exceptions.InvalidArgumentValueError(
                    f"Metric {metric.name} requires score column in predictions, but it is not available.",
                )
            if metric.requires_rank() and metrics.RANK_COLUMN not in predictions.columns:
                raise exceptions.InvalidArgumentValueError(
                    f"Metric {metric.name} requires rank column in predictions, but it is not available.",
                )

            score = metric_class(**params).score(truth, predictions)

            outputs['metric'].append(metric.name)
            outputs['value'].append(score)

            if self.hyperparams['add_normalized_scores']:
                outputs['normalized'].append(metric.normalize(score))

        # Dictionary key order is preserved in Python 3.6+ which makes column order as we want it.
        results = container.DataFrame(data=outputs, columns=list(outputs.keys()), generate_metadata=True)

        # Not really necessary, but it does not hurt. In theory somebody could list same metric multiple times
        # (maybe with different params), so we use "PrimaryMultiKey" here.
        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            'https://metadata.datadrivendiscovery.org/types/PrimaryMultiKey',
        )
        results.metadata = results.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 1),
            'https://metadata.datadrivendiscovery.org/types/Score',
        )
        if self.hyperparams['add_normalized_scores']:
            results.metadata = results.metadata.add_semantic_type(
                (metadata_base.ALL_ELEMENTS, 2),
                'https://metadata.datadrivendiscovery.org/types/Score',
            )

        return base.CallResult(results)
Example 29
def combine_columns(
    inputs: container.DataFrame,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[container.DataFrame],
    *,
    return_result: str,
    add_index_columns: bool,
) -> container.DataFrame:
    """
    Method which appends existing columns, replaces them, or creates new result from them, based on
    ``return_result`` argument, which can be ``append``, ``replace``, or ``new``.

    ``add_index_columns`` controls if when creating a new result, primary index columns should be added
    if they are not already among columns.

    ``inputs`` is a DataFrame for which we are appending or replacing columns, or, if we are creating a new result,
    from where a primary index column can be taken.

    ``column_indices`` controls which columns in ``inputs`` were used to create ``columns_list``,
    and which columns should be replaced when replacing them.

    ``columns_list`` is a list of DataFrames representing all together new columns. The reason it is a list is
    to make it easier to operate per-column when preparing ``columns_list`` and not have to concat them all
    together unnecessarily.

    Top-level metadata in ``columns_list`` is ignored, except when creating new result.
    In that case top-level metadata from the first element in the list is used.

    When ``column_indices`` columns are being replaced with ``columns_list``, existing metadata in ``column_indices``
    columns is not preserved but replaced with metadata in ``columns_list``. Ideally, metadata for ``columns_list``
    has been constructed by copying source metadata from ``column_indices`` columns and modifying it as
    necessary to adapt it to new columns. But ``columns_list`` also can have completely new metadata, if this
    is more reasonable, but it should be understood that in this case when replacing ``column_indices``
    columns, any custom additional metadata on those columns will be lost.

    ``column_indices`` and ``columns_list`` do not have to match in number of columns. Columns are first
    replaced in order for matching indices and columns. If then there are more ``column_indices`` than
    ``columns_list``, additional ``column_indices`` columns are removed. If there are more ``columns_list`` than
    ``column_indices`` columns, then additional ``columns_list`` are inserted after the last replaced column.

    If ``column_indices`` is empty, then the replacing behavior is equivalent to appending.
    """

    if return_result == 'append':
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            return combine_columns(inputs,
                                   column_indices,
                                   columns_list,
                                   return_result='append',
                                   add_index_columns=add_index_columns)

        # Compute the difference in "columns"
        to_be_added = list(
            numpy.setdiff1d(numpy.arange(len(inputs.columns)), column_indices))
        columns_replaced = 0
        if len(to_be_added) < len(column_indices):
            # More efficient to concatenate than replace one-by-one
            outputs = pandas.concat(columns_list, axis=1)
            outputs = container.DataFrame(data=outputs,
                                          generate_metadata=False)
            indices = range(columns_list[0].shape[1])
            outputs.metadata = inputs.metadata.select_columns(
                columns=list(indices))

            c = 0
            for columns in columns_list:
                columns_length = columns.shape[1]
                if c == 0:
                    outputs.metadata = outputs.metadata.replace_columns(
                        columns.metadata, list(indices))
                else:
                    outputs.metadata = outputs.metadata.append_columns(
                        columns.metadata)
                c += 1

            for col in to_be_added:
                insert_index = col.item()
                if insert_index > outputs.shape[1]:
                    insert_index = outputs.shape[1]
                outputs = outputs.insert_columns(
                    inputs.select_columns([col.item()]), insert_index)
            outputs.metadata = outputs.metadata.compact(['structural_type'])
        else:
            # We copy here and disable copying inside "replace_columns" to copy only once.
            # We have to copy because "replace_columns" is modifying data in-place.
            outputs = copy.copy(inputs)
            for columns in columns_list:
                columns_length = columns.shape[1]
                if columns_replaced < len(column_indices):
                    # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                    # listed in the slice will be replaced and others appended after the last replaced column.
                    outputs = outputs.replace_columns(
                        columns,
                        column_indices[columns_replaced:columns_replaced +
                                       columns_length],
                        copy=False)
                else:
                    # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                    # is non-empty and that the last item of "column_indices" points to the last column we replaced
                    # for those listed in "column_indices". We replaced more columns though, so we have to add the
                    # difference, and then add 1 to insert after the last column.
                    outputs = outputs.insert_columns(
                        columns, column_indices[-1] +
                        (columns_replaced - len(column_indices)) + 1)
                columns_replaced += columns_length

            if columns_replaced < len(column_indices):
                outputs = outputs.remove_columns(
                    column_indices[columns_replaced:len(column_indices)])
    elif return_result == 'new':
        if not any(columns.shape[1] for columns in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.metadata.get_index_columns()
            outputs_index_columns = outputs.metadata.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(
                    inputs_index_columns).append_columns(
                        outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(
                return_result=return_result))

    return outputs
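The replace semantics described in the docstring (replace matching positions first, insert any extra new columns after the last replaced one, drop leftover replaced columns) mirror Python's slice assignment on a list of column labels. A worked illustration with made-up labels:

inputs_columns = ['d3mIndex', 'a', 'b', 'c']
column_indices = [1, 2]        # 'a' and 'b' produced the new columns
new_columns = ['x', 'y', 'z']  # one more new column than replaced indices

combined = list(inputs_columns)
combined[1:3] = new_columns    # 'x', 'y' replace 'a', 'b'; 'z' is inserted after 'y'
print(combined)                # ['d3mIndex', 'x', 'y', 'z', 'c']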
Example 30
def combine_columns_metadata(
    inputs: metadata_base.DataMetadata,
    column_indices: typing.Sequence[int],
    columns_list: typing.Sequence[metadata_base.DataMetadata],
    *,
    return_result: str,
    add_index_columns: bool,
) -> metadata_base.DataMetadata:
    """
    Analogous to ``combine_columns`` but operates only on metadata.
    """

    if return_result == 'append':
        outputs = inputs
        for columns in columns_list:
            outputs = outputs.append_columns(columns)

    elif return_result == 'replace':
        if not column_indices:
            return combine_columns_metadata(
                inputs,
                column_indices,
                columns_list,
                return_result='append',
                add_index_columns=add_index_columns)

        outputs = inputs

        columns_replaced = 0
        for columns in columns_list:
            columns_length = columns.query_field(
                (metadata_base.ALL_ELEMENTS, ), 'dimension')['length']
            if columns_replaced < len(column_indices):
                # It is OK if the slice of "column_indices" is shorter than "columns". Only those columns
                # listed in the slice will be replaced and others appended after the last replaced column.
                outputs = outputs.replace_columns(
                    columns, column_indices[columns_replaced:columns_replaced +
                                            columns_length])
            else:
                # We insert the rest of columns after the last columns we replaced. We know that "column_indices"
                # is non-empty and that the last item of "column_indices" points to the last column we replaced
                # for those listed in "column_indices". We replaced more columns though, so we have to add the
                # difference, and then add 1 to insert after the last column.
                outputs = outputs.insert_columns(
                    columns, column_indices[-1] +
                    (columns_replaced - len(column_indices)) + 1)
            columns_replaced += columns_length

        if columns_replaced < len(column_indices):
            outputs = outputs.remove_columns(
                column_indices[columns_replaced:len(column_indices)])

    elif return_result == 'new':
        if not any(
                columns_metadata.query_field(
                    (metadata_base.ALL_ELEMENTS, ), 'dimension')['length']
                for columns_metadata in columns_list):
            raise ValueError("No columns produced.")

        outputs = columns_list[0]
        for columns in columns_list[1:]:
            outputs = outputs.append_columns(columns)

        if add_index_columns:
            inputs_index_columns = inputs.get_index_columns()
            outputs_index_columns = outputs.get_index_columns()

            if inputs_index_columns and not outputs_index_columns:
                # Add index columns at the beginning.
                outputs = inputs.select_columns(
                    inputs_index_columns).append_columns(
                        outputs, use_right_metadata=True)

    else:
        raise exceptions.InvalidArgumentValueError(
            "\"return_result\" has an invalid value: {return_result}".format(
                return_result=return_result))

    return outputs