Example #1
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        primary_key_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/PrimaryKey"]
        )

        unfold_cols = common_utils.list_columns_with_semantic_types(
            metadata=inputs.metadata,
            semantic_types=self.hyperparams["unfold_semantic_types"]
        )

        if not primary_key_cols:
            warnings.warn("Did not find primary key column for grouping. Will not unfold")
            return CallResult(inputs)

        if not unfold_cols:
            warnings.warn("Did not find any column to unfold. Will not unfold")
            return CallResult(inputs)

        primary_key_col_names = [inputs.columns[pos] for pos in primary_key_cols]
        unfold_col_names = [inputs.columns[pos] for pos in unfold_cols]

        if self.hyperparams["use_pipeline_id_semantic_type"]:
            pipeline_id_cols = common_utils.list_columns_with_semantic_types(
                metadata=inputs.metadata,
                semantic_types=["https://metadata.datadrivendiscovery.org/types/PipelineId"]
            )

            if len(pipeline_id_cols) >= 2:
                warnings.warn("Multiple pipeline id columns found. Will use first.")

            if pipeline_id_cols:
                inputs = inputs.sort_values(primary_key_col_names + [inputs.columns[pos] for pos in pipeline_id_cols])
                self._sorted_pipe_ids = sorted(inputs.iloc[:, pipeline_id_cols[0]].unique())
            else:
                warnings.warn(
                    "No pipeline id column found by 'https://metadata.datadrivendiscovery.org/types/PipelineId'")

        new_df = self._get_new_df(inputs=inputs, use_cols=primary_key_cols + unfold_cols)

        groupby_df = inputs.groupby(primary_key_col_names)[unfold_col_names].aggregate(
            lambda x: container.List(x)).reset_index(drop=False)

        ret_df = container.DataFrame(groupby_df)
        ret_df.metadata = new_df.metadata
        ret_df = self._update_metadata_dimension(df=ret_df)

        split_col_names = [inputs.columns[pos] for pos in unfold_cols]

        ret_df = self._split_aggregated(df=ret_df, split_col_names=split_col_names)
        ret_df = common_utils.remove_columns(
            inputs=ret_df,
            column_indices=[ret_df.columns.get_loc(name) for name in split_col_names]
        )

        return CallResult(ret_df)
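
The grouping step above collapses rows that share a primary key into per-key lists before the aggregated cells are split back out into columns. A minimal pandas sketch of that groupby/aggregate-to-list step, on a made-up frame with a hypothetical d3mIndex key:

import pandas as pd

df = pd.DataFrame({
    "d3mIndex": [0, 0, 1, 1],            # hypothetical primary key column
    "prediction": [0.1, 0.2, 0.3, 0.4],  # hypothetical column to unfold
})

# Collapse each key's values into a list, mirroring
# groupby(...)[...].aggregate(lambda x: container.List(x)).reset_index(drop=False)
grouped = df.groupby(["d3mIndex"])[["prediction"]].aggregate(list).reset_index(drop=False)
print(grouped)
#    d3mIndex  prediction
# 0         0  [0.1, 0.2]
# 1         1  [0.3, 0.4]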
Example #2
    def fit(self, *, timeout: float = None, iterations: int = None) -> None:

        if self._fitted:
            return

        if self._input_data is None:
            raise ValueError('Missing training (fitting) data.')

        # Look at attribute columns only
        # print('fit in', self._input_data.columns)
        data = self._input_data.copy()
        all_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = utils.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ()) == str:
                if pd.isnull(pd.to_numeric(data.iloc[:, element], errors='coerce')).sum() == data.shape[0]:
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i]:
                self._empty_columns.append(i)

        _logger.debug('Removing entirely empty columns: {}'.format(data.columns[self._empty_columns]))

        data = utils.remove_columns(data, self._empty_columns, source='ISI DSBox Data Encoder')

        categorical_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata,
                                                                        semantic_types=[
                                                                            "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                                                                            "https://metadata.datadrivendiscovery.org/types/CategoricalData"])
        all_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        self._cat_col_index = list(set(all_attributes).intersection(categorical_attributes))
        self._cat_columns = data.columns[self._cat_col_index].tolist()

        _logger.debug('Encoding columns: {}'.format(self._cat_columns))

        mapping = {}
        for column_name in self._cat_columns:
            col = data[column_name]
            temp = self._trim_features(col, self.hyperparams['n_limit'])
            if temp:
                mapping[temp[0]] = temp[1]
        self._mapping = mapping
        self._fitted = True
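
A small self-contained sketch, assuming a plain pandas frame, of the two "entirely empty" checks used in fit() above: string columns whose values never parse as numbers, and columns that are all missing:

import pandas as pd

df = pd.DataFrame({
    "a": ["x", "y", "z"],      # strings that never parse as numbers
    "b": [None, None, None],   # all values missing
    "c": [1, 2, 3],
})

# Check 1: a string column is "empty" if numeric coercion yields only NaN.
all_unparseable = pd.isnull(pd.to_numeric(df["a"], errors="coerce")).sum() == df.shape[0]

# Check 2: a column is "empty" if every cell is null.
is_empty = pd.isnull(df).sum(axis=0) == df.shape[0]

print(all_unparseable)     # True
print(is_empty.tolist())   # [False, True, False]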
Example #3
    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            return CallResult(inputs, self._has_finished,
                              self._iterations_done)

        assert isinstance(
            self._model,
            dict), "self._model type must be dict not defaultdict!"

        temp = pd.DataFrame(
            inputs.iloc[:, self._s_cols].apply(lambda x: self._model[
                x.name].transform(x) if x.name in self._model else None))

        outputs = inputs.copy()
        for id_index, od_index in zip(self._s_cols, range(temp.shape[1])):
            outputs.iloc[:, id_index] = temp.iloc[:, od_index]
        lookup = {
            "int": ('http://schema.org/Integer',
                    'https://metadata.datadrivendiscovery.org/types/Attribute')
        }

        for index in self._s_cols:
            old_metadata = dict(
                outputs.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["semantic_types"] = lookup["int"]
            old_metadata["structural_type"] = type(10)
            outputs.metadata = outputs.metadata.update(
                (mbase.ALL_ELEMENTS, index), old_metadata)

        # remove the columns that appeared in produce method but were not in fitted data
        drop_names = set(outputs.columns[self._s_cols]).difference(
            set(self._model.keys()))
        drop_indices = map(lambda a: outputs.columns.get_loc(a), drop_names)
        drop_indices = sorted(drop_indices)
        outputs = common_utils.remove_columns(outputs,
                                              drop_indices,
                                              source='ISI DSBox Data Labler')

        # sanity check and report the results
        if outputs.shape[0] == inputs.shape[0] and \
           outputs.shape[1] == inputs.shape[1] - len(drop_names):
            self._has_finished = True
            self._iterations_done = True
            # print("output:",outputs.head(5))
            return CallResult(d3m_DataFrame(outputs), self._has_finished,
                              self._iterations_done)
        else:
            return CallResult(inputs, self._has_finished,
                              self._iterations_done)
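
The produce() above applies a dict of per-column encoders fitted earlier (self._model), transforming only the columns an encoder exists for. A hedged stand-in sketch of that pattern, using sklearn's LabelEncoder in place of whatever encoder the primitive actually stores:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "L"]})

# Pretend only "color" was seen during fit, so only it has a fitted encoder.
model = {"color": LabelEncoder().fit(df["color"])}

# Transform exactly the columns that have a fitted encoder, leaving the rest alone.
encoded = pd.DataFrame({name: enc.transform(df[name]) for name, enc in model.items()})
print(encoded["color"].tolist())   # [1, 0, 1]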
Example #4
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        columns_list_to_fold = self._mapping.get('foldable_columns', [])
        if len(columns_list_to_fold) == 0:
            return CallResult(inputs, True, 1)
        if inputs.shape[0] > 20000:
            return CallResult(inputs, True, 1)
        self._column_names = list(inputs) if inputs is not None else []
        df = None
        for columns_to_fold in columns_list_to_fold:
            df = self._fold_columns(inputs, columns_to_fold)
        cols_to_drop = list()
        for col_idx, col_name in enumerate(inputs.columns):
            if col_name not in df.columns:
                cols_to_drop.append(col_idx)

        inputs = utils.remove_columns(inputs, cols_to_drop)
        new_df = inputs[0:0]
        for col_name in new_df.columns:
            new_df.loc[:, col_name] = df.loc[:, col_name]

        extends = {}
        for col_name in df.columns:
            if col_name not in new_df.columns:
                extends[col_name] = df.loc[:, col_name].tolist()

        if extends:
            extends_df = d3m_DataFrame.from_dict(extends)
            extends_df.index = new_df.index.copy()
            new_df = utils.append_columns(new_df, extends_df)
            new_df = self._update_type(new_df, list(extends.keys()))

        old_metadata = dict(new_df.metadata.query(()))
        old_metadata["dimension"] = dict(old_metadata["dimension"])
        old_metadata["dimension"]["length"] = new_df.shape[0]
        new_df.metadata = new_df.metadata.update((), old_metadata)

        return CallResult(new_df, True,
                          1) if new_df is not None else CallResult(
                              inputs, True, 1)
Example #5
    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        """
        Convert and output the input data into unary encoded format,
        using the trained (fitted) encoder.
        Value unseen in training_inputs would be rounded to nearest value in training_inputs.
        Missing(NaN) cells in a column one-hot encoded would give
        out a row of all-ZERO columns for the target column.
        """
        #if self._target_columns == []:
        #    return CallResult(inputs, True, 1)
        if not self._fitted:
            raise ValueError('Encoder model not fitted. Use fit()')

        # Return if there is nothing to encode
        if len(self._cat_columns)==0:
            return CallResult(inputs, True, 1)

        if isinstance(inputs, pd.DataFrame):
            data = inputs.copy()
        else:
            data = inputs[0].copy()
        data = utils.remove_columns(data, self._empty_columns)
        set_columns = set(data.columns)

        if set_columns != self._all_columns:
            raise ValueError('Columns(features) fed at produce() differ from fitted data.')

        # core part: encode the unary columns
        data_enc = data.iloc[:, self._cat_col_index].apply(lambda col: pd.to_numeric(col, errors='coerce'))
        data_else = data.drop(self._mapping.keys(),axis=1)
        res = []
        for column_name in data_enc:
            col = data_enc[column_name]
            col.is_copy = False
            # only apply the unary encoder to columns flagged at fit time (at most 12 distinct numeric values)
            if self._requirement[column_name]:
                chg_v = lambda x: min(self._mapping[col.name], key=lambda a:abs(a-x)) if x is not None else x
                # only encode values that are not null
                col[col.notnull()] = col[col.notnull()].apply(chg_v)
                encoded = self.__encode_column(col)
                res.append(encoded)
            else:
                res.append(col)

        if self._text2int:
            texts = data_else.select_dtypes([object])
            le = Label_encoder()
            le.set_params(self._textmapping)
            data_else[texts.columns] = le.transform_pd(texts)
        # transfer the encoded results to dataFrame
        encoded = d3m_DataFrame(pd.concat(res, axis=1))

        # update metadata for existing columns
        for index in range(len(encoded.columns)):
            old_metadata = dict(encoded.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["structural_type"] = int
            old_metadata["semantic_types"] = (
                'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
            encoded.metadata = encoded.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)
        # after extracting the target columns, remove them from the DataFrame
        data_else = utils.remove_columns(data, self._cat_col_index)
        result = utils.horizontal_concat(data_else, encoded)

        return CallResult(result, True, 1)
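
The chg_v lambda above is the "round unseen values to the nearest fitted value" rule from the docstring. A minimal sketch of it on made-up numbers:

# Values recorded at fit time (hypothetical).
seen_values = [1.0, 5.0, 10.0]

def snap(x, seen=seen_values):
    # Snap x to the closest value observed during fit; leave missing values alone.
    return min(seen, key=lambda a: abs(a - x)) if x is not None else x

print(snap(6.2))   # 5.0
print(snap(9.0))   # 10.0
print(snap(None))  # None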
Example #6
    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        """
        Need training data from set_training_data first.
        The encoder would record specified columns to encode and column values to
        unary encode later in the produce step.
        """
        if self._fitted:
            return CallResult(None, has_finished=True, iterations_done=1)

        if self._training_inputs is None:
            raise ValueError('Missing training (fitting) data.')

        data = self._training_inputs.copy()
        all_attributes = utils.list_columns_with_semantic_types(metadata=data.metadata, semantic_types=[
            "https://metadata.datadrivendiscovery.org/types/Attribute"])

        # Remove columns with all empty values, structural type str
        numeric = utils.list_columns_with_semantic_types(
            data.metadata, ['http://schema.org/Integer', 'http://schema.org/Float'])
        numeric = [x for x in numeric if x in all_attributes]
        for element in numeric:
            if data.metadata.query((mbase.ALL_ELEMENTS, element)).get('structural_type', ())==str:
                if pd.isnull(pd.to_numeric(data.iloc[:,element], errors='coerce')).sum() == data.shape[0]:
                    self._empty_columns.append(element)

        # Remove columns with all empty values, structural numeric
        is_empty = pd.isnull(data).sum(axis=0) == data.shape[0]
        for i in all_attributes:
            if is_empty.iloc[i]:
                self._empty_columns.append(i)
        self._empty_columns = list(set(self._empty_columns))
        self._empty_columns.reverse()
        self._empty_columns = container.List(self._empty_columns)
        data = utils.remove_columns(data, self._empty_columns)
        # print('fit', data.shape)

        categorical_attributes = utils.list_columns_with_semantic_types(
            metadata=data.metadata,
            semantic_types=[
                "https://metadata.datadrivendiscovery.org/types/OrdinalData",
                "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                ]
            )
        all_attributes = utils.list_columns_with_semantic_types(
            metadata=data.metadata,
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"]
            )
        self._cat_col_index = container.List(set(all_attributes).intersection(numeric))
        self._cat_columns = container.List(data.columns[self._cat_col_index].tolist())
        #import pdb
        #pdb.set_trace()
        numerical_values = data.iloc[:, self._cat_col_index].apply(
            lambda col: pd.to_numeric(col, errors='coerce'))

        self._all_columns = set(data.columns)

        # mapping
        idict = {}
        for name in self._cat_columns:
            col = numerical_values[name]
            idict[name] = sorted(col.unique())
        self._mapping = idict

        if self._text2int:
            texts = data.drop(self._mapping.keys(),axis=1)
            texts = texts.select_dtypes(include=[object])
            le = Label_encoder()
            le.fit_pd(texts)
            self._textmapping = le.get_params()

        # determine whether to run unary encoder on the given column or not
        data_enc = data.iloc[:, self._cat_col_index].apply(lambda col: pd.to_numeric(col, errors='coerce'))
        for column_name in data_enc:
            col = data_enc[column_name]
            col.is_copy = False
            # only apply the unary encoder when the column has fewer than 13 distinct numeric values
            if col.unique().shape[0] < 13:
                self._requirement[column_name] = True
            else:
                self._requirement[column_name] = False

        self._fitted = True

        return CallResult(None, has_finished=True, iterations_done=1)
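
A toy sketch of the fit-time bookkeeping above: coerce the candidate columns to numeric, record the sorted unique values per column for the mapping, and flag a column for unary encoding only when it has fewer than 13 distinct values. Column names here are made up:

import pandas as pd

data = pd.DataFrame({"rooms": ["1", "2", "2", "3"], "price": [100, 250, 80, 90]})

numeric = data.apply(lambda col: pd.to_numeric(col, errors="coerce"))

# mapping: sorted unique values per column, as in self._mapping
mapping = {name: sorted(numeric[name].unique()) for name in numeric.columns}

# requirement: encode only low-cardinality columns, as in self._requirement
requirement = {name: numeric[name].unique().shape[0] < 13 for name in numeric.columns}

print(mapping["rooms"])       # [1, 2, 3]
print(requirement["rooms"])   # True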
Example #7
    def _process_files(self, inputs: Input):
        fn_attributes = inputs.metadata.list_columns_with_semantic_types(
            semantic_types=["https://metadata.datadrivendiscovery.org/types/FileName"])
        all_attributes = inputs.metadata.list_columns_with_semantic_types(
            semantic_types=["https://metadata.datadrivendiscovery.org/types/Attribute"])
        fn_columns = list(set(all_attributes).intersection(fn_attributes))

        # if no file name columns are detected, default to regular behavior
        if len(fn_columns) == 0:
            return inputs

        # create an empty DataFrame of the required size
        processed_cols = pd.DataFrame("", index = copy.deepcopy(inputs.index), \
            columns = ['text_files_' + str(i) for i in range(len(fn_columns))])

        # for column_index in range(len(fn_columns)):
        for column_index in fn_columns:
            curr_column = copy.deepcopy(inputs.iloc[:, column_index])

            file_loc = inputs.metadata.query(
                (ALL_ELEMENTS, column_index))['location_base_uris']
            file_loc = file_loc[0]  # take the first elem of the tuple
            file_loc = file_loc[7:]  # get rid of 'file://' prefix

            for row_index in range(curr_column.shape[0]):
                text_file = curr_column.iloc[row_index]
                file_path = file_loc + text_file

                with open(file_path, 'rb') as file:
                    doc = file.read()
                doc = "".join(map(chr, doc))
                doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(
                    doc)  # list of strings

                processed_cols.iloc[row_index,
                                    fn_columns.index(column_index)] = " ".join(
                                        doc_tokens)

        # construct metadata for the newly generated columns
        processed_cols = d3m_DataFrame(processed_cols, generate_metadata=True)

        for column_index in range(processed_cols.shape[1]):
            col_dict = dict(
                processed_cols.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type("text")
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = 'processed_file_' + str(inputs.shape[1] +
                                                       column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Text',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            processed_cols.metadata = processed_cols.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        # concatenate the input with the newly created columns
        updated_inputs = utils.append_columns(inputs, processed_cols)

        # remove the initial FileName columns from the df; doing this before concatenating could raise an empty dataset error
        updated_inputs = utils.remove_columns(updated_inputs, fn_columns)

        return updated_inputs
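
The inner loop above reads each referenced text file and reduces it to whitespace-joined word tokens. A small sketch of just that tokenization step, with a byte string standing in for file.read():

import re

doc_bytes = b"Hello, world! This is a tiny test."       # stands in for file.read()

doc = "".join(map(chr, doc_bytes))                       # byte-by-byte decode, as above
doc_tokens = re.compile(r"(?u)\b\w\w+\b").findall(doc)   # tokens of 2+ word characters

print(" ".join(doc_tokens))   # Hello world This is tiny test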
Example #8
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        # if corex didn't run for any reason, just return the given dataset
        if self.do_nothing:
            return CallResult(inputs, True, 1)

        inputs = self._process_files(inputs)

        if iterations is not None:
            self.max_iter = iterations
        else:
            self.max_iter = 250
        self.model.max_iter = self.max_iter

        # concatenate the columns row-wise
        concat_cols = None
        for column_index in self.text_columns:
            if concat_cols is not None:
                concat_cols = concat_cols.str.cat(inputs.iloc[:, column_index],
                                                  sep=" ")
            else:
                concat_cols = copy.deepcopy(inputs.iloc[:, column_index])
        bow = self.bow.transform(map(self._get_ngrams, concat_cols.ravel()))

        # choose between CorEx and the TfIdf matrix
        if bow.shape[1] > self.hyperparams['threshold']:
            # use CorEx
            self.latent_factors = self.model.transform(bow).astype(float)
        else:
            # just use the bag of words representation
            self.latent_factors = pd.DataFrame(bow.todense())
        # make the columns corex adds distinguishable from other columns

        # remove the selected columns from input and add the latent factors given by corex
        out_df = d3m_DataFrame(inputs, generate_metadata=True)

        self.latent_factors.columns = [
            str(out_df.shape[-1] + i)
            for i in range(self.latent_factors.shape[-1])
        ]

        # create metadata for the corex columns
        corex_df = d3m_DataFrame(self.latent_factors, generate_metadata=True)
        for column_index in range(corex_df.shape[1]):
            col_dict = dict(
                corex_df.metadata.query((ALL_ELEMENTS, column_index)))
            col_dict['structural_type'] = type(1.0)
            # FIXME: assume we apply corex only once per template, otherwise column names might duplicate
            col_dict['name'] = 'corex_' + str(out_df.shape[1] + column_index)
            col_dict['semantic_types'] = (
                'http://schema.org/Float',
                'https://metadata.datadrivendiscovery.org/types/Attribute')

            corex_df.metadata = corex_df.metadata.update(
                (ALL_ELEMENTS, column_index), col_dict)

        # concatenate is --VERY-- slow without this next line
        corex_df.index = out_df.index.copy()

        out_df = utils.append_columns(out_df, corex_df)

        # remove the initial text columns from the df; doing this before CorEx can cause an empty dataset error
        out_df = utils.remove_columns(out_df, self.text_columns)

        # TO DO : Incorporate timeout, max_iter
        # return CallResult(d3m_DataFrame(self.latent_factors))
        return CallResult(out_df, True, 1)
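
The branch on bow.shape[1] above keeps the raw bag-of-words when the vocabulary is small and switches to a latent-factor model when it is wide. A hedged sketch of that decision, with TruncatedSVD standing in for the CorEx model and a made-up threshold:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

docs = ["red fish blue fish", "one fish two fish", "the cat in the hat"]
bow = TfidfVectorizer().fit_transform(docs)   # sparse document-term matrix

threshold = 5   # hypothetical hyperparameter
if bow.shape[1] > threshold:
    # vocabulary too wide: compress to a few latent factors
    latent_factors = pd.DataFrame(TruncatedSVD(n_components=2).fit_transform(bow))
else:
    # small vocabulary: keep the dense bag-of-words as-is
    latent_factors = pd.DataFrame(bow.todense())

print(latent_factors.shape)   # (3, 2)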
Example #9
    def produce(self, *, inputs: Input, timeout: float = None, iterations: int = None) -> CallResult[Output]:
        """
        Convert and output the input data into encoded format,
        using the trained (fitted) encoder.
        Notice that [colname]_other_ and [colname]_nan columns
        are always kept for one-hot encoded columns.
        """

        self._input_data_copy = inputs.copy()

        # Remove columns with all empty values
        _logger.debug('Removing entirely empty columns: {}'.format(self._input_data_copy.columns[self._empty_columns]))
        self._input_data_copy = utils.remove_columns(self._input_data_copy, self._empty_columns,
                                                     source='ISI DSBox Data Encoder')

        # Return if there is nothing to encode
        if len(self._cat_columns) == 0:
            return CallResult(self._input_data_copy, True, 1)

        _logger.debug('Encoding columns: {}'.format(self._cat_columns))

        data_encode = self._input_data_copy[list(self._mapping.keys())]

        # Get rid of false SettingWithCopyWarning
        data_encode.is_copy = None

        res = []
        for column_name in self._cat_columns:
            feature = data_encode[column_name].copy()
            other_ = lambda x: 'Other' if (x and x not in self._mapping[column_name]) else x
            nan_ = lambda x: x if x else np.nan
            feature.loc[feature.notnull()] = feature[feature.notnull()].apply(other_)
            feature = feature.apply(nan_)
            new_column_names = ['{}_{}'.format(column_name, i) for i in self._mapping[column_name] + ['nan']]
            encoded = pd.get_dummies(feature, dummy_na=True, prefix=column_name)
            missed = [name for name in new_column_names if name not in list(encoded.columns)]
            for m in missed:
                # print('missing', m)
                encoded[m] = 0
            encoded = encoded[new_column_names]
            res.append(encoded)
            # data_encode.loc[:,column_name] = feature

        # Drop columns that will be encoded
        # data_rest = self._input_data_copy.drop(self._mapping.keys(), axis=1)
        columns_names = self._input_data_copy.columns.tolist()
        drop_indices = [columns_names.index(col) for col in self._mapping.keys()]
        drop_indices = sorted(drop_indices)

        all_categorical = False
        try:
            self._input_data_copy = utils.remove_columns(self._input_data_copy, drop_indices,
                                                         source='ISI DSBox Data Encoder')
        except ValueError:
            _logger.warning("[warn] All the attributes are categorical!")
            all_categorical = True

        # metadata for columns that are not one hot encoded
        # self._col_index = [self._input_data_copy.columns.get_loc(c) for c in data_rest.columns]
        # data_rest.metadata = utils.select_columns_metadata(self._input_data_copy.metadata, self._col_index)

        # encode data
        # encoded = d3m_DataFrame(pd.get_dummies(data_encode, dummy_na=True, prefix=self._cat_columns, prefix_sep='_',
        #                                        columns=self._cat_columns))
        encoded = d3m_DataFrame(pd.concat(res, axis=1))

        # update metadata for existing columns

        for index in range(len(encoded.columns)):
            old_metadata = dict(encoded.metadata.query((mbase.ALL_ELEMENTS, index)))
            old_metadata["structural_type"] = int
            old_metadata["semantic_types"] = (
                'http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')
            encoded.metadata = encoded.metadata.update((mbase.ALL_ELEMENTS, index), old_metadata)
        ## merge/concat both the dataframes
        if not all_categorical:
            output = utils.horizontal_concat(self._input_data_copy, encoded)
        else:
            output = encoded
        return CallResult(output, True, 1)
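
A toy sketch of the per-column encoding loop above: values not seen at fit time become 'Other', NaNs get their own indicator via dummy_na=True, and the result is forced onto a fixed column list so categories absent from this batch still appear as all-zero columns. The fitted category list is made up:

import numpy as np
import pandas as pd

fitted_values = ["red", "blue"]                       # categories recorded at fit time
feature = pd.Series(["red", "green", np.nan], name="color")

# Map values outside the fitted categories to 'Other', keep NaN as NaN.
feature = feature.where(feature.isnull() | feature.isin(fitted_values), "Other")

encoded = pd.get_dummies(feature, dummy_na=True, prefix="color")

# Force the full expected column set; missing categories become all-zero columns.
expected_cols = ["color_{}".format(v) for v in fitted_values + ["Other", "nan"]]
encoded = encoded.reindex(columns=expected_cols, fill_value=0)

print(encoded.columns.tolist())
# ['color_red', 'color_blue', 'color_Other', 'color_nan']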
Example #10
    def produce(self,
                *,
                inputs: Input,
                timeout: float = None,
                iterations: int = None) -> CallResult[Output]:
        self._input_data_copy = inputs.copy()
        cols_to_drop = list()

        date_cols = self._mapping.get("date_columns")
        if date_cols:
            cols_to_drop += self._mapping.get("date_columns")
            original_cols = self._get_cols(self._input_data_copy)
            dfo = DateFeaturizerOrg(dataframe=self._input_data_copy)
            df = dfo.featurize_date_columns(date_cols)
            current_cols = self._get_cols(df["df"])

            _logger.info(
                "Date Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df["df"]

        phone_cols = self._mapping.get("phone_columns")
        if phone_cols:
            cols_to_drop += phone_cols.get("columns_to_perform", [])
            original_cols = self._get_cols(self._input_data_copy)
            df = PhoneParser.perform(df=self._input_data_copy,
                                     columns_perform=phone_cols)
            current_cols = self._get_cols(df)

            _logger.info(
                "Phone Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df

        an_cols = self._mapping.get("alpha_numeric_columns")
        if an_cols:
            cols_to_drop += an_cols.get("columns_to_perform", [])
            original_cols = self._get_cols(self._input_data_copy)
            df = NumAlphaParser.perform(df=self._input_data_copy,
                                        columns_perform=an_cols)
            current_cols = self._get_cols(df)

            _logger.info(
                "NumAlpha Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df

        punc_cols = self._mapping.get("punctuation_columns")
        if punc_cols:
            cols_to_drop += punc_cols.get("columns_to_perform", [])
            original_cols = self._get_cols(self._input_data_copy)
            df = PunctuationParser.perform(df=self._input_data_copy,
                                           columns_perform=punc_cols)
            current_cols = self._get_cols(df)

            _logger.info(
                "Punctuation Featurizer. 'created_columns': '%(created_columns)s'.",
                {
                    'created_columns':
                    str(list(set(current_cols).difference(original_cols))),
                },
            )

            self._input_data_copy = df

        if cols_to_drop:
            self._input_data_copy = common_utils.remove_columns(
                self._input_data_copy, list(set(cols_to_drop)))
        self._update_structural_type()

        return CallResult(self._input_data_copy, True, 1)
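
A hedged sketch of the date-featurizing branch: parse a text column into datetimes and expand it into numeric component columns before the original column is dropped, roughly what DateFeaturizerOrg is used for here. Column names and components are made up; the real featurizer's output may differ:

import pandas as pd

df = pd.DataFrame({"signup_date": ["2014-03-01", "2015-07-21", "not a date"]})

parsed = pd.to_datetime(df["signup_date"], errors="coerce")   # unparseable rows become NaT

df["signup_date_year"] = parsed.dt.year
df["signup_date_month"] = parsed.dt.month

# Drop the original text column, mirroring the cols_to_drop removal above.
df = df.drop(columns=["signup_date"])
print(df)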