class Application: """ Text cleaner application, starts by initializing object with cleaning utils. :param text: Text to clean :type text: str """ def __init__(self, text): self.text = text self.logger = logging.getLogger(Application.__name__) self.clean_utils = CleanUtils() def run(self) -> None: """ Executes text cleaning process. """ self.clean_text() self.logger.info(f'Cleaned text | {self.text}') def remove_characters_and_hide_numbers(self) -> str: """ Calls removing special characters and hiding numbers procedures using cleaning utils. * Removed characters includes html and non text chars. * Hidden numbers includes different number formats, dates and time. :return: Text without special characters and with hidden numbers :rtype: str """ removed_characters_text = self.clean_utils.remove_characters_for_text( text=self.text) return self.clean_utils.hide_numbers(text=removed_characters_text) def lemmatize_text(self) -> None: """ Calls text lemmatization procedure using cleaning utils. """ self.text = self.clean_utils.lemmatize(text=self.text) def clean_text(self) -> None: """ Calls cleaning operations on given text. """ self.text = self.remove_characters_and_hide_numbers() self.lemmatize_text()
def remove_characters_and_hide_numbers(clean_utils: CleanUtils, text: str) -> str: """ Calls removing special characters and hiding numbers procedures using cleaning utils. * Removed characters includes html and non text chars. * Hidden numbers includes different number formats, dates and time. :param clean_utils: Cleaning utility class :type clean_utils: object :param text: Text to remove special characters and hide numbers :type text: str :return: Text without special characters and with hidden numbers :rtype: str """ removed_character_text = clean_utils.remove_characters_for_text( text=text) return clean_utils.hide_numbers(text=removed_character_text)
def __init__(self, text: str, config_path: str = DEFAULT_PREDICT_CONFIG_PATH): self.logger = logging.getLogger(Application.__name__) self.config = read_config(config_path, self.logger) self.text = text self.clean_utils = CleanUtils() self.json_reader = JsonReader() self.vocab_config = VocabConfig() self.model = None
def hide_numbers_for_column(column_data: pd.Series) -> pd.Series: """ Calls hiding numbers from clean utils for single column in data frame using cleaning utils. :param column_data: Column in data frame to clean. :type column_data: pd.Series :return: Cleaned data frame column :rtype: pd.Series """ return pd.Series( [CleanUtils.hide_numbers(text=text) for text in column_data])
def lemmatize_text(clean_utils: CleanUtils, text: str) -> str: """ Calls text lemmatization procedure using cleaning utils :param clean_utils: Cleaning utility class :type clean_utils: object :param text: Text to lemmatize :type text: str :return: Lemmatized text :rtype: str """ return clean_utils.lemmatize(text=text)
def remove_characters_for_column(column_data: pd.Series) -> pd.Series: """ Calls removal of unwanted characters in data frame column using cleaning utils. :param column_data: Column in data frame to remove characters from. :type column_data: pd.Series :return: Data frame column with removed characters :rtype: pd.Series """ return pd.Series([ CleanUtils.remove_characters_for_text(text=text) for text in column_data ])
def lemmatize_text_for_column(column_data: pd.Series, clean_utils: CleanUtils) -> pd.Series: """ Calls text lemmatization on single data frame column using cleaning utils. Method uses ``progress_map`` to visualize progress of lemmatization using tqdm. :param column_data: Column in data frame to lemmatize. :type column_data: pd.Series :param clean_utils: Cleaning utility class :type clean_utils: object :return: Column in data frame with lemmatized text :rtype: pd.Series """ return column_data.progress_map( lambda text: clean_utils.lemmatize(text=text))
def __init__(self, text): self.text = text self.logger = logging.getLogger(Application.__name__) self.clean_utils = CleanUtils()
def __init__(self, config: Dict[str, Any], data: pd.DataFrame): self.logger = logging.getLogger(Cleaner.__name__) self.config = config self.data = data self.n_cores = cpu_count() // 2 self.clean_utils = CleanUtils()
class Cleaner: """ Cleans raw text data frame obtaining data in unified format. * Removes unwanted characters * Hides numbers, date and time in different formats * Lemmatizes text :param config: Configuration dictionary :type config: dict :param data: Raw text data frame to clean :type data: pd.DataFrame """ def __init__(self, config: Dict[str, Any], data: pd.DataFrame): self.logger = logging.getLogger(Cleaner.__name__) self.config = config self.data = data self.n_cores = cpu_count() // 2 self.clean_utils = CleanUtils() @timeit def clean_dataframe(self) -> pd.DataFrame: """ Executes data frame cleaning process. :return: Cleaned data frame :rtype: pd.DataFrame """ self.convert_list_to_text_in_dataframe() self.remove_characters_for_dataframe() if self.config['hide_numbers']: self.hide_numbers() if self.config['lemmatize']: self.lemmatize_text() return self.data def convert_list_to_text_in_dataframe(self) -> None: """ Calls conversion of data in list of sentences format to single, multi sentenced text using cleaning utils. """ for column_name in self.data: self.data[column_name] = [ self.clean_utils.convert_list_to_text(text_as_list=single_cell) for single_cell in self.data[column_name] ] def hide_numbers(self) -> None: """ Splits hiding numbers in data frame to separate columns. """ for column_name in self.data: self.data[column_name] = self.hide_numbers_for_column( self.data[column_name]) def lemmatize_text(self) -> None: """ Applies parallelization of text lemmatization for data frame using python multiprocessing. Lemmatization process is computationally expensive and thus parallelization greatly reduces the required time. """ self.clean_utils.lang_model = self.config['language_model'] dataframe_splits = np.array_split(self.data, self.n_cores) pool = Pool(self.n_cores) self.data = pd.concat( pool.map(self.lemmatize_text_for_dataframe, dataframe_splits)) pool.close() pool.join() def lemmatize_text_for_dataframe(self, dataframe: pd.DataFrame) -> pd.DataFrame: """ Splits text lemmatization in data frame into separated columns. :param dataframe: Data frame to lemmatize text in. :type dataframe: pd.DataFrame :return: Data frame with lemmatized text :rtype: pd.DataFrame """ for column_name in dataframe: dataframe[column_name] = self.lemmatize_text_for_column( column_data=dataframe[column_name], clean_utils=self.clean_utils, ) return dataframe def remove_characters_for_dataframe(self) -> None: """ Splits unwanted characters removal from data frame to separate columns. """ for column_name in self.data: self.data[column_name] = self.remove_characters_for_column( self.data[column_name]) @staticmethod def hide_numbers_for_column(column_data: pd.Series) -> pd.Series: """ Calls hiding numbers from clean utils for single column in data frame using cleaning utils. :param column_data: Column in data frame to clean. :type column_data: pd.Series :return: Cleaned data frame column :rtype: pd.Series """ return pd.Series( [CleanUtils.hide_numbers(text=text) for text in column_data]) @staticmethod def lemmatize_text_for_column(column_data: pd.Series, clean_utils: CleanUtils) -> pd.Series: """ Calls text lemmatization on single data frame column using cleaning utils. Method uses ``progress_map`` to visualize progress of lemmatization using tqdm. :param column_data: Column in data frame to lemmatize. :type column_data: pd.Series :param clean_utils: Cleaning utility class :type clean_utils: object :return: Column in data frame with lemmatized text :rtype: pd.Series """ return column_data.progress_map( lambda text: clean_utils.lemmatize(text=text)) @staticmethod def remove_characters_for_column(column_data: pd.Series) -> pd.Series: """ Calls removal of unwanted characters in data frame column using cleaning utils. :param column_data: Column in data frame to remove characters from. :type column_data: pd.Series :return: Data frame column with removed characters :rtype: pd.Series """ return pd.Series([ CleanUtils.remove_characters_for_text(text=text) for text in column_data ])