Ejemplo n.º 1
0
    def featurize_dataframe(self, df, col_id, ignore_errors=False,
                            return_errors=False, inplace=True,
                            multiindex=False):
        """
        Apply every featurizer in ``self.featurizers`` to the dataframe, in
        order, by calling each one's own ``featurize_dataframe``.

        Delegating to the individual featurizers (instead of computing the
        features here) keeps this method compatible with featurizers that
        override ``featurize_dataframe``.

        Args:
            df (Pandas dataframe): Dataframe containing input data.
            col_id: column label forwarded to every featurizer. When
                multiindex is True and df has flat columns, it is rewritten
                to ``("Input Data", col_id)``.
            ignore_errors (bool): forwarded unchanged to every featurizer.
            return_errors (bool): forwarded unchanged to every featurizer.
            inplace (bool): forwarded unchanged to every featurizer.
            multiindex (bool): If True, df is passed through
                ``homogenize_multiindex`` first and the feature columns are
                addressed as ``(featurizer class name, feature label)``.

        Returns:
            the dataframe returned by the last featurizer, with every
            feature column passed element-wise through ``np.squeeze``.
        """

        if multiindex:
            if not isinstance(df.columns, pd.MultiIndex):
                col_id = ("Input Data", col_id)
            df = homogenize_multiindex(df, "Input Data")

        # Detect if any featurizers override featurize_dataframe
        if any("featurize_dataframe" in f.__class__.__dict__
               for f in self.featurizers):
            warnings.warn(
                "One or more featurizers overrides featurize_dataframe, "
                "featurization will be sequential and may diminish performance")

        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; look the method up on the class so that a column
        # that happens to be called "map" can never shadow it.
        elementwise = getattr(pd.DataFrame, "map", None) \
            or pd.DataFrame.applymap

        for f in self.featurizers:
            df = f.featurize_dataframe(df, col_id, ignore_errors,
                                       return_errors, inplace, multiindex)

            if multiindex:
                feature_labels = [(f.__class__.__name__, flabel)
                                  for flabel in f.feature_labels()]
            else:
                feature_labels = f.feature_labels()
            # Strip length-1 dimensions from every feature value
            df[feature_labels] = elementwise(df[feature_labels], np.squeeze)
        return df
Ejemplo n.º 2
0
    def featurize_dataframe(self, df, col_id, *args, **kwargs):
        """
        Apply every featurizer in ``self.featurizers`` to the dataframe, in
        order, by calling each one's own ``featurize_dataframe``.

        Accepts the same arguments as BaseFeaturizer.featurize_dataframe;
        everything after ``col_id`` is forwarded untouched to each featurizer.

        Args:
            df (Pandas dataframe): Dataframe containing input data.
            col_id: column label forwarded to every featurizer. When
                multiindex is enabled and df has flat columns, it is
                rewritten to ``("Input Data", col_id)``.
            *args: positional options, in the order ignore_errors,
                return_errors, inplace, multiindex, ...
            **kwargs: the same options given by keyword.

        Returns:
            the dataframe returned by the last featurizer, with every
            feature column passed element-wise through ``np.squeeze``.
        """
        # multiindex may arrive by keyword or as the fourth positional option
        # (after ignore_errors, return_errors and inplace). Looking only at
        # kwargs would silently skip the multiindex handling below while the
        # wrapped featurizers still receive multiindex=True.
        if 'multiindex' in kwargs:
            multiindex = kwargs['multiindex']
        elif len(args) > 3:
            multiindex = args[3]
        else:
            multiindex = False

        if multiindex:
            if not isinstance(df.columns, pd.MultiIndex):
                col_id = ("Input Data", col_id)
            df = homogenize_multiindex(df, "Input Data")

        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; look the method up on the class so that a column
        # that happens to be called "map" can never shadow it.
        elementwise = getattr(pd.DataFrame, "map", None) \
            or pd.DataFrame.applymap

        for f in self.featurizers:
            df = f.featurize_dataframe(df, col_id, *args, **kwargs)

            if multiindex:
                feature_labels = [(f.__class__.__name__, flabel)
                                  for flabel in f.feature_labels()]
            else:
                feature_labels = f.feature_labels()
            # Strip length-1 dimensions from every feature value
            df[feature_labels] = elementwise(df[feature_labels], np.squeeze)
        return df
Ejemplo n.º 3
0
    def featurize_dataframe(self,
                            df,
                            col_id,
                            ignore_errors=False,
                            return_errors=False,
                            inplace=True,
                            multiindex=False,
                            pbar=True):
        """
        Featurize every row of a dataframe and attach the results as columns.

        Args:
            df (Pandas dataframe): Dataframe containing input data.
            col_id (str or list of str): column label containing objects to
                featurize. Can be multiple labels if the featurize function
                requires multiple inputs.
            ignore_errors (bool): Returns NaN for dataframe rows where
                exceptions are thrown if True. If False, exceptions
                are thrown as normal.
            return_errors (bool): Returns the errors encountered for each
                row in a separate `XFeaturizer errors` column if True. Requires
                ignore_errors to be True.
            inplace (bool): Whether to add new columns to input dataframe (df)
            multiindex (bool): If True, use a Featurizer - Feature 2-level
                index using the MultiIndex capabilities of pandas. If done
                inplace, multiindex featurization will overwrite the original
                dataframe's column index.
            pbar (bool): Shows a progress bar if True.

        Returns:
            updated dataframe.

        Raises:
            ValueError: if df has MultiIndex columns while multiindex is
                False, or if a generated column label already exists in df
                and ``self._overwrite_data`` is missing or falsy.
        """

        # A lone label is wrapped in a list so col_id is always list-like
        if isinstance(col_id, string_types):
            col_id = [col_id]

        # Multiindexing interacts with the other options: warn or refuse
        if multiindex and inplace:
            warnings.warn("Multiindexing enabled with inplace=True! The "
                          "original dataframe index has changed.")
        if not multiindex and isinstance(df.columns, pd.MultiIndex):
            # A multiindexed input needs multiindex=True
            raise ValueError(
                "Please enable multiindexing to featurize an input"
                " dataframe containing a column multiindex.")

        # Labels of the columns that are about to be produced
        out_labels = self._generate_column_labels(multiindex, return_errors)

        # Existing columns are only protected when the featurizer does not
        # carry a truthy _overwrite_data attribute (ConversionFeaturizer
        # defines it to control whether an error is raised)
        overwrite_ok = getattr(self, '_overwrite_data', False)
        if not overwrite_ok:
            for existing in df.columns.values:
                if existing in out_labels:
                    raise ValueError(
                        '"{}" exists in input dataframe'.format(existing))

        # Run the featurizer over the raw values of the input column(s)
        inputs = df[col_id].values
        features = self.featurize_many(inputs,
                                       ignore_errors=ignore_errors,
                                       return_errors=return_errors,
                                       pbar=pbar)

        # Only now is df given 2-level columns, so the lookup above still
        # used the caller's original column labels
        if multiindex:
            df = homogenize_multiindex(df, "Input Data")

        # Frame holding just the new features, aligned on df's index
        computed = pd.DataFrame(features, index=df.index, columns=out_labels)

        if not inplace:
            # Build a new dataframe: original columns first, features after
            combined = pd.concat([df, computed], axis=1)
            ordered = df.columns.tolist() + computed.columns.tolist()
            return combined[ordered]

        # In-place: copy the feature columns onto the existing dataframe
        for label in out_labels:
            df[label] = computed[label]
        return df
Ejemplo n.º 4
0
    def featurize_dataframe(self,
                            df,
                            col_id,
                            ignore_errors=False,
                            return_errors=False,
                            inplace=True,
                            multiindex=False):
        """
        Compute features for all entries contained in input dataframe.

        Args:
            df (Pandas dataframe): Dataframe containing input data.
            col_id (str or list of str): column label containing objects to
                featurize. Can be multiple labels if the featurize function
                requires multiple inputs.
            ignore_errors (bool): Returns NaN for dataframe rows where
                exceptions are thrown if True. If False, exceptions
                are thrown as normal.
            return_errors (bool): Returns the errors encountered for each
                row in a separate `XFeaturizer Exceptions` column if True.
                Requires ignore_errors to be True.
            inplace (bool): Whether to add new columns to input dataframe (df)
            multiindex (bool): If True, the new columns are labelled with a
                2-level (featurizer class name, feature label) MultiIndex
                and df is passed through ``homogenize_multiindex`` with
                "Input Data" before the features are attached.

        Returns:
            updated dataframe.

        Raises:
            ValueError: if df has MultiIndex columns while multiindex is
                False, or if a feature label already exists in df.
        """

        # If only one column and user provided a string, put it inside a list
        if isinstance(col_id, string_types):
            col_id = [col_id]

        # Fail fast: reject a multiindexed input before any features are
        # computed instead of after the (potentially slow) featurization
        if not multiindex and isinstance(df.columns, pd.MultiIndex):
            # If input df is multi, but multi not enabled...
            raise ValueError(
                "Please enable multiindexing to featurize an input"
                " dataframe containing a column multiindex.")

        # Generate the feature labels. Work on a copy: the list is extended
        # below and must not leak into whatever feature_labels() returned
        labels = list(self.feature_labels())

        # Check names to avoid overwriting the current columns
        for col in df.columns.values:
            if col in labels:
                raise ValueError('"{}" exists in input dataframe'.format(col))

        # Compute the features
        features = self.featurize_many(df[col_id].values,
                                       ignore_errors=ignore_errors,
                                       return_errors=return_errors)
        if return_errors:
            # One extra column carries the per-row errors
            labels.append(self.__class__.__name__ + " Exceptions")

        if multiindex:
            # Promote the flat labels to (featurizer name, label) pairs and
            # give df matching 2-level columns
            indices = ([self.__class__.__name__], labels)
            labels = pd.MultiIndex.from_product(indices)
            df = homogenize_multiindex(df, "Input Data")

        # Create dataframe with the new features
        res = pd.DataFrame(features, index=df.index, columns=labels)

        if inplace:
            # Update the existing dataframe
            for k in labels:
                df[k] = res[k]
            return df
        else:
            # Create new dataframe and ensure columns are ordered properly
            new = pd.concat([df, res], axis=1)
            return new[df.columns.tolist() + res.columns.tolist()]
Ejemplo n.º 5
0
    def featurize_dataframe(self, df, col_id, ignore_errors=False,
                            return_errors=False, inplace=True,
                            multiindex=False, pbar=True):
        """
        Run this featurizer over a dataframe and return the dataframe with
        the resulting feature columns added.

        Args:
            df (Pandas dataframe): Dataframe containing input data.
            col_id (str or list of str): column label containing objects to
                featurize. Can be multiple labels if the featurize function
                requires multiple inputs.
            ignore_errors (bool): Returns NaN for dataframe rows where
                exceptions are thrown if True. If False, exceptions
                are thrown as normal.
            return_errors (bool): Returns the errors encountered for each
                row in a separate `XFeaturizer errors` column if True. Requires
                ignore_errors to be True.
            inplace (bool): Whether to add new columns to input dataframe (df)
            multiindex (bool): If True, use a Featurizer - Feature 2-level
                index using the MultiIndex capabilities of pandas. If done
                inplace, multiindex featurization will overwrite the original
                dataframe's column index.
            pbar (bool): Shows a progress bar if True.

        Returns:
            updated dataframe.

        Raises:
            ValueError: if df has MultiIndex columns while multiindex is
                False, or if a generated column label already exists in df
                and ``self._overwrite_data`` is missing or falsy.
        """

        # Normalise a single string label to a one-element list
        col_id = [col_id] if isinstance(col_id, string_types) else col_id

        if not multiindex:
            # A multiindexed input cannot be handled without multiindex=True
            if isinstance(df.columns, pd.MultiIndex):
                raise ValueError("Please enable multiindexing to featurize an input"
                                 " dataframe containing a column multiindex.")
        elif inplace:
            # Multiindexing together with inplace alters the caller's columns
            warnings.warn("Multiindexing enabled with inplace=True! The "
                          "original dataframe index has changed.")

        # Column labels for the features that will be generated
        new_labels = self._generate_column_labels(multiindex, return_errors)

        # Guard against clobbering existing columns, unless the featurizer
        # carries a truthy _overwrite_data attribute (ConversionFeaturizer
        # defines it to decide whether an error is raised)
        if not getattr(self, '_overwrite_data', False):
            for present in df.columns.values:
                if present in new_labels:
                    raise ValueError(
                        '"{}" exists in input dataframe'.format(present))

        # Featurize the raw values of the selected column(s)
        options = dict(ignore_errors=ignore_errors,
                       return_errors=return_errors,
                       pbar=pbar)
        features = self.featurize_many(df[col_id].values, **options)

        # Give df 2-level columns only after the inputs have been read
        if multiindex:
            df = homogenize_multiindex(df, "Input Data")

        # New features as their own frame, sharing df's index
        feature_frame = pd.DataFrame(features, index=df.index,
                                     columns=new_labels)

        if inplace:
            # Attach each feature column to the existing dataframe
            for label in new_labels:
                df[label] = feature_frame[label]
            result = df
        else:
            # New dataframe with the original columns ahead of the features
            column_order = (df.columns.tolist()
                            + feature_frame.columns.tolist())
            result = pd.concat([df, feature_frame], axis=1)[column_order]
        return result