def apply_pandas_multiprocessing(
    X_, func_, args_=None, cols_=None, n_jobs=1, progress_bar=False, **kwargs
):
    """Apply ``func_`` row-wise on ``X_`` across multiple workers.

    Parameters
    ----------
    X_ : pandas.DataFrame
        Data the function is applied on (mutated in place unless
        ``cols_`` is None).
    func_ : callable
        Function applied to each row (``axis=1``).
    args_ : tuple, optional
        Positional arguments forwarded to ``func_``.
    cols_ : list of str, optional
        Destination column(s). If None the whole result replaces ``X_``;
        if one column, the result fills that column; if several, the
        result rows are expanded into those columns via ``pd.Series``.
    n_jobs : int
        Number of worker processes.
    progress_bar : bool
        Whether to display a progress bar.
    **kwargs
        NOTE(review): accepted but never used — presumably kept for
        signature compatibility; confirm before removing.

    Returns
    -------
    pandas.DataFrame
    """
    # Single shared call: the original duplicated this invocation in
    # every branch; only the assignment target differs.
    result = apply_by_multiprocessing(
        df=X_,
        func=func_,
        args=args_,
        axis=1,
        workers=n_jobs,
        progress_bar=progress_bar,
    )
    if cols_ is None:
        X_ = result
    elif len(cols_) == 1:
        X_[cols_[0]] = result
    else:
        # Each returned row is expanded into one value per column.
        X_[cols_] = result.apply(pd.Series)
    return X_
def transform(self, X):
    """Apply functions defined in the `function_scheduler` parameter.

    Parameters
    ----------
    X : pandas.DataFrame,
        Data on which transformations are applied.

    Returns
    -------
    pandas.DataFrame
    """
    # Work on a copy only when requested, otherwise mutate X in place.
    X_ = X.copy() if self.copy else X

    for tuple_ in self.functions_scheduler:
        func_, args_, cols_ = _check_tuple(*tuple_)

        # Compute the row-wise result once; the original duplicated the
        # apply call in six branches (2 modes x 3 column cases).
        if self.mode == 'apply':
            result = X_.apply(func_, args=args_, axis=1)
        else:  # 'apply_by_multiprocessing'
            result = apply_by_multiprocessing(
                df=X_,
                func=func_,
                args=args_,
                axis=1,
                workers=self.n_jobs,
                progress_bar=self.progress_bar,
            )

        # Dispatch the result to the requested destination column(s).
        if cols_ is None:
            X_ = result
        elif len(cols_) == 1:
            X_[cols_[0]] = result
        else:
            # Expand each returned row into one value per column.
            X_[cols_] = result.apply(pd.Series)

    return X_
def transform(self, X):
    """Return keywords in apparition order for each document, using the
    weighted tf-idf already fitted.

    Parameters
    ----------
    X : pandas.DataFrame, shape (n_samples, n_features)
        X must contain ['tokens'] column.

    Returns
    -------
    X_new : pandas.DataFrame, shape (n_samples, n_components)
    """
    frame = X.copy() if self.copy else X
    frame['keywords'] = apply_by_multiprocessing(
        df=frame[['tokens']],
        func=self.get_keywords,
        axis=1,
        workers=self.n_jobs,
        progress_bar=self.progress_bar,
    )
    return frame
def predict(self, X):
    """Add a "sentiment_score" column to the DataFrame, holding the
    polarity score of each document towards the list of seeds provided
    (object must already be fitted).

    Parameters
    ----------
    X : DataFrame
        Input emails DataFrame
    """
    scores = apply_by_multiprocessing(
        X,
        self.rate_email,
        workers=self.n_jobs,
        progress_bar=self.progress_bar,
    )
    X['sentiment_score'] = scores
    return X
def to_flattoks(self, X):
    """Create a flat list of token lists from a pd.Series.

    Each list of tokens corresponds to one sentence.

    Parameters
    ----------
    X : pd.Dataframe,

    Returns
    -------
    list of lists of strings
    """
    per_document = apply_by_multiprocessing(
        X[self.columns_],
        self.to_list_of_tokenized_sentences,
        workers=self.n_jobs,
    )
    # Flatten: one entry per sentence across all documents.
    flattoks = []
    for sentences in per_document:
        flattoks.extend(sentences)
    return flattoks
def predict(self, X, return_column='score'):
    """Add a "score" column (or the column name given as argument) to the
    DataFrame, holding the polarity score of each document towards the
    list of seeds provided (object must already be fitted).

    Parameters
    ----------
    X : DataFrame
        Input emails DataFrame
    return_column : str
        Name of the new column added to the DataFrame containing the
        semantic score
    """
    scores = apply_by_multiprocessing(
        X,
        self.rate_email,
        workers=self.n_jobs,
        progress_bar=self.progress_bar,
    )
    X[return_column] = scores
    return X
def to_flattoks(self, X):
    """Create a flat list of token lists from a pd.Series.

    Each list of tokens corresponds to one sentence.

    Parameters
    ----------
    X : pd.Dataframe,

    Returns
    -------
    list of lists of strings
    """
    per_document = apply_by_multiprocessing(
        df=X[[self.column_]],
        func=lambda row: self.to_list_of_tokenized_sentences(row[self.column_]),
        args=None,
        workers=self.n_jobs,
        progress_bar=False,
    )
    # Flatten the per-document sentence lists into one list.
    return [tokens for sentences in per_document for tokens in sentences]