import pandas as pd

# NOTE: apply_by_multiprocessing is assumed to be importable from the
# project's multiprocessing utilities; the import is not part of this snippet.


def apply_pandas_multiprocessing(
    X_, func_, args_=None, cols_=None, n_jobs=1, progress_bar=False, **kwargs
):
    """Apply func_ row-wise over X_ using worker processes, either replacing
    the frame or writing the result into one or several target columns,
    depending on cols_."""
    if cols_ is None:
        # No target columns: the whole frame is replaced by the result.
        X_ = apply_by_multiprocessing(
            df=X_,
            func=func_,
            args=args_,
            axis=1,
            workers=n_jobs,
            progress_bar=progress_bar,
        )
    elif len(cols_) == 1:
        # Single target column: store the row-wise result directly.
        X_[cols_[0]] = apply_by_multiprocessing(
            df=X_,
            func=func_,
            args=args_,
            axis=1,
            workers=n_jobs,
            progress_bar=progress_bar,
        )
    else:
        # Several target columns: func_ returns a tuple per row, which the
        # extra .apply(pd.Series) spreads over the listed columns.
        X_[cols_] = apply_by_multiprocessing(
            df=X_,
            func=func_,
            args=args_,
            axis=1,
            workers=n_jobs,
            progress_bar=progress_bar,
        ).apply(pd.Series)
    return X_
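Every snippet on this page delegates the row-wise work to apply_by_multiprocessing, whose implementation is not shown. The helper below is only a minimal sketch of what such a function could look like, built on the standard library's multiprocessing.Pool; the name apply_by_multiprocessing_sketch, the one-chunk-per-worker splitting strategy, and the demo function _square_value are assumptions for illustration, not the actual library code.

import multiprocessing

import pandas as pd


def _apply_chunk(payload):
    # Runs inside a worker process: apply func row-wise to one chunk.
    chunk, func, args = payload
    return chunk.apply(func, args=args or (), axis=1)


def apply_by_multiprocessing_sketch(df, func, args=None, workers=1, **kwargs):
    # Split the frame into one contiguous block of rows per worker, map the
    # blocks over a process pool, then reassemble the results in order.
    workers = max(int(workers), 1)
    chunk_size = max(-(-len(df) // workers), 1)  # ceiling division
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
    with multiprocessing.Pool(processes=workers) as pool:
        results = pool.map(_apply_chunk, [(chunk, func, args) for chunk in chunks])
    return pd.concat(results)


def _square_value(row):
    # Module-level function so it can be pickled and shipped to the workers.
    return row["value"] ** 2


if __name__ == "__main__":
    frame = pd.DataFrame({"value": range(8)})
    frame["squared"] = apply_by_multiprocessing_sketch(frame, _square_value, workers=2)
    print(frame)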
    def transform(self, X):
        """Apply functions defined in the `function_scheduler` parameter.

        Parameters
        ----------
        X : pandas.DataFrame,
            Data on which transformations are applied.

        Returns
        -------
        pandas.DataFrame
        """
        if self.copy:
            X_ = X.copy()
        else:
            X_ = X

        for tuple_ in self.functions_scheduler:
            func_, args_, cols_ = _check_tuple(*tuple_)
            # cols_ = cols_ or X_.columns

            if self.mode == 'apply':
                if cols_ is None:
                    X_ = X_.apply(func_, args=args_, axis=1)
                elif len(cols_) == 1:
                    X_[cols_[0]] = X_.apply(func_, args=args_, axis=1)
                else:
                    X_[cols_] = X_.apply(func_, args=args_,
                                         axis=1).apply(pd.Series)
            else:  # 'apply_by_multiprocessing'
                if cols_ is None:
                    X_ = apply_by_multiprocessing(
                        df=X_,
                        func=func_,
                        args=args_,
                        axis=1,
                        workers=self.n_jobs,
                        progress_bar=self.progress_bar)
                elif len(cols_) == 1:
                    X_[cols_[0]] = apply_by_multiprocessing(
                        df=X_,
                        func=func_,
                        args=args_,
                        axis=1,
                        workers=self.n_jobs,
                        progress_bar=self.progress_bar)
                else:
                    X_[cols_] = apply_by_multiprocessing(
                        df=X_,
                        func=func_,
                        args=args_,
                        axis=1,
                        workers=self.n_jobs,
                        progress_bar=self.progress_bar).apply(pd.Series)
        return X_
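For readers unfamiliar with the column dispatch above, here is a pandas-only sketch of the three cols_ cases handled in the 'apply' branch. The frame, the column names and the lambda row functions are made up for illustration; the scheduler tuples are assumed to unpack into (func_, args_, cols_) exactly as in the code above.

import pandas as pd

df = pd.DataFrame({"body": ["hello world", "quick note to self"]})

# cols_ is None: func_ builds the whole output row, so the frame is replaced.
df = df.apply(lambda row: pd.Series({"body": row["body"].upper()}), axis=1)

# len(cols_) == 1: func_ returns a scalar written into a single column.
df["n_words"] = df.apply(lambda row: len(row["body"].split()), axis=1)

# len(cols_) > 1: func_ returns a tuple that the extra .apply(pd.Series)
# spreads over the listed columns.
df[["first_word", "last_word"]] = df.apply(
    lambda row: (row["body"].split()[0], row["body"].split()[-1]), axis=1
).apply(pd.Series)

print(df)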
Example #3
    def transform(self, X):
        """Returns list of keywords in apparition order for each document
        with the weighted tf-idf already fitted.

        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, n_features)
            X must contain a 'tokens' column.

        Returns
        -------
        X_new : pandas.DataFrame, shape (n_samples, n_components)
        """
        if self.copy:
            X_ = X.copy()
        else:
            X_ = X

        X_['keywords'] = apply_by_multiprocessing(
            df=X_[['tokens']],
            func=self.get_keywords,
            axis=1,
            workers=self.n_jobs,
            progress_bar=self.progress_bar)

        return X_
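The heavy lifting in this snippet is done by get_keywords, which is fitted beforehand and not shown. As a rough, hypothetical stand-in, the row function below keeps only tokens found in a fixed vocabulary; the real method relies on the fitted tf-idf weights, and the toy data and VOCABULARY set are assumptions. Multiprocessing is dropped for brevity.

import pandas as pd

VOCABULARY = {"meeting", "invoice", "deadline"}


def get_keywords_sketch(row):
    # The row comes from X_[['tokens']], so only the 'tokens' column is visible.
    return [tok for tok in row["tokens"] if tok in VOCABULARY]


X_ = pd.DataFrame({"tokens": [["re", "meeting", "tomorrow"], ["invoice", "42"]]})
X_["keywords"] = X_[["tokens"]].apply(get_keywords_sketch, axis=1)
print(X_)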
Example #4
    def predict(self, X):
        """
        Given that the object has already been fitted, add a new column
        "sentiment_score" to the pandas DataFrame, containing the polarity
        score of each document towards the list of seeds provided.

        Parameters
        ----------
        X : DataFrame
            Input emails DataFrame

        """
        X['sentiment_score'] = apply_by_multiprocessing(
            X,
            self.rate_email,
            workers=self.n_jobs,
            progress_bar=self.progress_bar)

        return X
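rate_email carries the actual scoring logic and is not part of this snippet. Purely to show the contract it must satisfy (one row in, one float out), here is a hypothetical lexicon-style stand-in; the seed weights, the 'body' column and the averaging rule are invented for this sketch and do not reflect how the fitted detector really scores documents.

import pandas as pd

SEEDS = {"great": 1.0, "thanks": 0.5, "problem": -1.0}


def rate_email_sketch(row):
    # One email row in, one polarity score out.
    tokens = str(row["body"]).lower().split()
    if not tokens:
        return 0.0
    return sum(SEEDS.get(tok, 0.0) for tok in tokens) / len(tokens)


emails = pd.DataFrame({"body": ["thanks for the great work", "big problem here"]})
emails["sentiment_score"] = emails.apply(rate_email_sketch, axis=1)
print(emails)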
Example #5
    def to_flattoks(self, X):
        """Create list of list of tokens from a pd.Series
        Each list of tokens correspond to a sentence.

        Parameters
        ----------
        X : pd.Dataframe,

        Returns
        -------
        list of lists of strings
        """
        tokenized_sentences_list = apply_by_multiprocessing(X[self.columns_],
                                                            self.to_list_of_tokenized_sentences,
                                                            workers=self.n_jobs
                                                            )
        flattoks = [item for sublist in tokenized_sentences_list
                    for item in sublist]
        return flattoks
Example #6
    def predict(self, X, return_column='score'):
        """
        Given that the object has already been fitted, add a new column
        "score" (or the column name passed as argument) to the pandas
        DataFrame, containing the polarity score of each document towards
        the list of seeds provided.

        Parameters
        ----------
        X : DataFrame
            Input emails DataFrame
        return_column : str
            Name of the new column added to the DataFrame containing the semantic score

        """
        X[return_column] = apply_by_multiprocessing(
            X,
            self.rate_email,
            workers=self.n_jobs,
            progress_bar=self.progress_bar)

        return X
Example #7
    def to_flattoks(self, X):
        """Create list of list of tokens from a pd.Series
        Each list of tokens correspond to a sentence.

        Parameters
        ----------
        X : pd.Dataframe,

        Returns
        -------
        list of lists of strings
        """
        tokenized_sentences_list = apply_by_multiprocessing(df=X[[self.column_]],
                                                            func=lambda x: self.to_list_of_tokenized_sentences(x[self.column_]),
                                                            args=None,
                                                            workers=self.n_jobs,
                                                            progress_bar=False
                                                            )
        flattoks = [item for sublist in tokenized_sentences_list
                    for item in sublist]
        return flattoks
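To make the flattening step concrete, here is a pandas-only version of the same flow with a hypothetical sentence tokenizer; the 'clean_body' column name and the naive split on '.' are assumptions, and the multiprocessing wrapper is dropped for brevity.

import pandas as pd


def to_list_of_tokenized_sentences_sketch(text):
    # One token list per sentence, splitting naively on '.'.
    return [sentence.split() for sentence in text.split(".") if sentence.strip()]


X = pd.DataFrame({"clean_body": ["hello world. see you soon", "thanks a lot"]})
tokenized_sentences_list = X["clean_body"].apply(to_list_of_tokenized_sentences_sketch)

# Flatten the per-document lists into a single list of tokenized sentences.
flattoks = [sentence for doc in tokenized_sentences_list for sentence in doc]
print(flattoks)  # [['hello', 'world'], ['see', 'you', 'soon'], ['thanks', 'a', 'lot']]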