Ejemplo n.º 1
0
def validate_processors(config: dict) -> None:
    """Check that every processor configured for each stage can be used.

    For each of the ``PRE_PROCESSING``, ``PROCESSING`` and
    ``POST_PROCESSING`` lists in *config*, every entry must name a processor
    registered in ``processors_types`` whose registered type matches the
    stage (``'file'`` for pre/post processing, ``'line'`` for processing).
    Each processor is then instantiated once with its configured parameters
    so that wrong ``__init__`` arguments are reported during validation.

    An entry is either a processor name (``str``) or a non-empty dict whose
    first key is the processor name and whose value holds the keyword
    arguments for the processor's ``__init__``.

    Args:
        config: configuration mapping; must contain the three stage keys.

    Raises:
        TDCValueError: if an entry is malformed, names an unknown processor,
            or names a processor whose type does not match the stage.
        TDCTypeError: if instantiating the processor raises ``TypeError``.
    """
    stage_types = {
        'PRE_PROCESSING': 'file',
        'PROCESSING': 'line',
        'POST_PROCESSING': 'file',
    }

    for stage_name, stage_type in stage_types.items():
        for processor in config[stage_name]:
            if isinstance(processor, str):
                processor_name, params = processor, {}
            elif isinstance(processor, dict) and processor:
                # HACK: processor with parameters for __init__
                processor_name, params = next(iter(processor.items()))
            else:
                # Empty dicts and other types cannot name a processor; report
                # them as a config error instead of crashing on the lookup.
                raise TDCValueError(f'Wrong processor: {processor}')

            if processor_name not in processors_types:
                raise TDCValueError(
                    f'Processor {processor_name} for stage {stage_name} not found!'
                )

            if processors_types[processor_name] != stage_type:
                msg = f'Processor {processor_name} for stage {stage_name} must be a {stage_type}-typed processor'
                raise TDCValueError(msg)

            # Try to create the processor to surface initialization errors.
            try:
                processors_dict[processor_name](**params)
            except TypeError as exc:
                message = str(exc)
                # FIXME: hack for tell processor name
                message = message.replace('__init__()',
                                          f'{processor_name} processor')
                message = f'{message} for __init__ method'
                # Chain the cause so the original traceback is not lost.
                raise TDCTypeError(message) from exc
Ejemplo n.º 2
0
 def __init__(self, mode: str):
     """Store *mode* after checking it is 'title', 'lower' or 'upper'.

     Raises:
         TDCValueError: if *mode* is not one of the allowed values.
     """
     allowed_modes = ['title', 'lower', 'upper']
     if mode in allowed_modes:
         self.mode = mode
         return

     raise TDCValueError(
         f'Wrong mode for {self.name} processor: {mode}, allowed only: {allowed_modes}'
     )
Ejemplo n.º 3
0
    def __init__(self, mode: str = 'remove_line', replace_with: str = ' '):
        """Store *mode* and *replace_with* after validating *mode*.

        Args:
            mode: either 'remove_line' or 'replace'.
            replace_with: kept as ``self.replace_with`` unchanged.

        Raises:
            TDCValueError: if *mode* is not one of the allowed values.
        """
        valid_modes = ['remove_line', 'replace']
        mode_is_valid = mode in valid_modes
        if not mode_is_valid:
            message = f'Wrong mode for {self.name} processor: {mode}, allowed only: {valid_modes}'
            raise TDCValueError(message)

        self.mode, self.replace_with = mode, replace_with
Ejemplo n.º 4
0
    def __init__(self, form: str = 'NFKC'):
        """Store *form* after checking it is 'NFC', 'NFD', 'NFKC' or 'NFKD'.

        Raises:
            TDCValueError: if *form* is not one of the allowed values.
        """
        known_forms = ['NFC', 'NFD', 'NFKC', 'NFKD']
        if form not in known_forms:
            message = f'Wrong form for {self.name} processor: {form}, allowed only: {known_forms}'
            raise TDCValueError(message)

        self.form = form
Ejemplo n.º 5
0
def download_file(url: str, save_path: str) -> None:
    """Download *url* and write the response body to *save_path*.

    The body is streamed and written in chunks of ``CHUNK_SIZE`` so the
    whole payload is never held in memory at once.

    Args:
        url: address to fetch with an HTTP GET request.
        save_path: file path opened in binary write mode for the body.

    Raises:
        TDCValueError: if the response status is not OK.
    """
    # With stream=True the connection stays open until the body is fully
    # consumed or the response is closed. Using the response as a context
    # manager releases it on every path, including the error path below
    # where the body is never read.
    with requests.get(url, stream=True) as response:
        if not response.ok:
            raise TDCValueError(
                f'Download {url} failed with code {response.status_code}')

        # TODO: log download started / finished
        with open(save_path, 'wb') as fh:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                fh.write(chunk)
Ejemplo n.º 6
0
def validate_config(config: dict) -> None:
    """Validate the top-level keys of *config* and the types of their values.

    All three stage keys must be present, no unknown keys are allowed, and
    every value must be an instance of the type registered for its key.

    Raises:
        TDCValueError: if a required key is missing or a key is unknown.
        TDCTypeError: if a value has the wrong type for its key.
    """
    # Every accepted parameter (required and optional) with its type.
    known_types = {
        'PRE_PROCESSING': list,
        'PROCESSING': list,
        'POST_PROCESSING': list,
        'CACHE_DIR': str,
    }
    mandatory = ('PRE_PROCESSING', 'PROCESSING', 'POST_PROCESSING')

    # Report the first missing required key, in declaration order.
    missing = next((name for name in mandatory if name not in config), None)
    if missing is not None:
        raise TDCValueError(
            f'Missing required configuration parameter: {missing}')

    for key, value in config.items():
        expected = known_types.get(key)
        if expected is None:
            raise TDCValueError(f'Unknown config parameter: {key}')

        if isinstance(value, expected):
            continue

        raise TDCTypeError(
            f'Configuration parameter {key} must be a type of {expected}'
        )
    def _get_line_processors(self) -> List[BaseProcessor]:
        """Instantiate the processors listed under ``PROCESSING`` in the config.

        Each entry of ``self.config['PROCESSING']`` is either a processor
        name, or a dict whose first key is the processor name and whose value
        holds the keyword arguments for the processor's ``__init__``.  The
        name is looked up in ``processors_dict`` and the result is called
        with those keyword arguments.

        Returns:
            The created processor objects, in configuration order.

        Raises:
            TDCValueError: if an entry is neither a ``str`` nor a ``dict``.
        """
        instances = []
        for entry in self.config['PROCESSING']:
            if isinstance(entry, str):
                name, kwargs = entry, {}
            elif isinstance(entry, dict):
                # HACK: processor with parameters for __init__
                name = list(entry)[0]
                kwargs = entry[name]
            else:
                # TODO: own exceptions
                raise TDCValueError(f'Wrong processor: {entry}')

            instances.append(processors_dict[name](**kwargs))

        return instances
Ejemplo n.º 8
0
    def __init__(self, language_code: str, mode: str, replace_with: str = ' '):
        """Validate the arguments and build the stop-word regex for a language.

        The stop-word list for *language_code* is downloaded from the
        ``6/stopwords-json`` GitHub repository and compiled into one
        case-insensitive, word-bounded alternation stored in
        ``self.stop_words_re``.

        Both arguments are validated before anything is downloaded, so a
        wrong ``mode`` fails immediately instead of after a network round
        trip.

        Args:
            language_code: one of the codes listed in ``allowed_language``.
            mode: either 'remove_line' or 'replace'.
            replace_with: kept as ``self.replace_with`` unchanged.

        Raises:
            TDCValueError: if *language_code* or *mode* is not allowed, or
                (raised by ``download_file``) if the download fails.
        """
        allowed_language = [
            # https://github.com/6/stopwords-json/tree/master/dist
            # Run in Dev Browser Console:
            # var languages = '';
            # $x("//a[starts-with(@href, '/6/stopwords-json/blob/master/dist/')]/@href").forEach(function(el) {
            #   var code = el.textContent.replace('/6/stopwords-json/blob/master/dist/', '').replace('.json', '');
            #   languages = languages + "'" + code + "',\n";
            # });
            # console.log(languages);
            'af',
            'ar',
            'bg',
            'bn',
            'br',
            'ca',
            'cs',
            'da',
            'de',
            'el',
            'en',
            'eo',
            'es',
            'et',
            'eu',
            'fa',
            'fi',
            'fr',
            'ga',
            'gl',
            'ha',
            'he',
            'hi',
            'hr',
            'hu',
            'hy',
            'id',
            'it',
            'ja',
            'ko',
            'la',
            'lv',
            'mr',
            'nl',
            'no',
            'pl',
            'pt',
            'ro',
            'ru',
            'sk',
            'sl',
            'so',
            'st',
            'sv',
            'sw',
            'th',
            'tr',
            'yo',
            'zh',
            'zu',
        ]
        if language_code not in allowed_language:
            msg = f'Wrong language for {self.name} processor: {language_code}, allowed only: {allowed_language}'
            raise TDCValueError(msg)

        allowed = ['remove_line', 'replace']
        if mode not in allowed:
            raise TDCValueError(
                f'Wrong mode for {self.name} processor: {mode}, allowed only: {allowed}'
            )

        self.language_code = language_code
        self.mode = mode
        self.replace_with = replace_with

        url = f'https://raw.githubusercontent.com/6/stopwords-json/master/dist/{self.language_code}.json'
        temp_file = get_temp_file_path()

        # FIXME: write & read? Better download to variable
        # NOTE(review): the temp file is not removed here - confirm whether
        # get_temp_file_path() arranges for cleanup.
        download_file(url, temp_file)
        with open(temp_file, encoding='utf-8') as fd:
            stop_words = json.load(fd)

        # '|' separates the alternatives below, so it is stripped from the
        # words. A word that becomes empty is dropped: an empty alternative
        # would match at every word boundary.
        unique_words = {word.replace('|', '') for word in stop_words}
        unique_words.discard('')

        # Longest first, so a short word cannot shadow a longer one that
        # starts with it (e.g. "a" vs "a's"). The secondary alphabetical key
        # makes the pattern identical on every run, unlike set order.
        ordered_words = sorted(unique_words, key=lambda word: (-len(word), word))

        # Escape each word so regex metacharacters are matched literally.
        alternation = '|'.join(re.escape(word) for word in ordered_words)
        stop_words_regex = rf'\b({alternation})\b'
        self.stop_words_re = re.compile(stop_words_regex,
                                        flags=re.UNICODE | re.IGNORECASE)