def _default_processor_units(cls) -> list:
    """
    Build the default list of processor units.

    A new list holding newly constructed unit instances is returned on
    every call, so the caller is free to extend or modify it.

    NOTE(review): `StopRemovalUnit` is part of these defaults; confirm
    that is intended for every caller that also offers an explicit
    stop-word option.

    :return: List with one instance each of `TokenizeUnit`,
        `LowercaseUnit`, `PuncRemovalUnit` and `StopRemovalUnit`,
        in that order.
    """
    unit_classes = (
        processor_units.TokenizeUnit,
        processor_units.LowercaseUnit,
        processor_units.PuncRemovalUnit,
        processor_units.StopRemovalUnit,
    )
    return [unit_class() for unit_class in unit_classes]
def __init__(self, fixed_length_left: int = 30,
             fixed_length_right: int = 30,
             filter_mode: str = 'df',
             filter_low_freq: float = 2,
             filter_high_freq: float = float('inf'),
             remove_stop_words: bool = False):
    """
    Initialization.

    Creates the fixed-length units for the left and right side, the
    frequency filter unit, and the list of default processor units.

    :param fixed_length_left: Length handed to the left
        `FixedLengthUnit` (created with `pad_mode='post'`).
    :param fixed_length_right: Length handed to the right
        `FixedLengthUnit` (created with `pad_mode='post'`).
    :param filter_mode: Forwarded to `FrequencyFilterUnit` as `mode`.
    :param filter_low_freq: Forwarded to `FrequencyFilterUnit` as `low`.
    :param filter_high_freq: Forwarded to `FrequencyFilterUnit` as
        `high`.
    :param remove_stop_words: When true, guarantee that the default
        unit list contains a `StopRemovalUnit`.

    NOTE(review): when `remove_stop_words` is false, the list returned
    by `_default_processor_units()` is used unchanged; nothing is
    removed from it. Confirm that matches the intended meaning of the
    flag.
    """
    super().__init__()
    self._fixed_length_left = fixed_length_left
    self._fixed_length_right = fixed_length_right
    self._left_fixedlength_unit = processor_units.FixedLengthUnit(
        self._fixed_length_left,
        pad_mode='post',
    )
    self._right_fixedlength_unit = processor_units.FixedLengthUnit(
        self._fixed_length_right,
        pad_mode='post',
    )
    self._filter_unit = processor_units.FrequencyFilterUnit(
        low=filter_low_freq,
        high=filter_high_freq,
        mode=filter_mode,
    )
    self._default_units = self._default_processor_units()
    if remove_stop_words:
        # The defaults may already carry a stop-word remover; adding a
        # second one would only repeat the same pass over every text.
        already_present = any(
            isinstance(unit, processor_units.StopRemovalUnit)
            for unit in self._default_units
        )
        if not already_present:
            self._default_units.append(processor_units.StopRemovalUnit())