def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
    """Set up a segmenter that splits text into a list of sentences.

    Sentences can be returned with or without their character offsets
    from the original text.

    Parameters
    ----------
    language : str, optional
        Two-character ISO 639-1 code of the language to use,
        by default "en". It is passed to
        ``Language.get_language_code`` and the result is stored as
        ``language_module``.
    clean : bool, optional
        Cleans the original text, by default False.
    doc_type : str or None, optional
        Normal text or OCRed text, by default None. Set to ``pdf``
        for OCRed text.
    char_span : bool, optional
        Get the start and end character offsets of each sentence
        within the original text, by default False.
    """
    self.language = language
    # Resolve the ISO code right after storing it, before the other options.
    self.language_module = Language.get_language_code(language)
    self.clean, self.doc_type, self.char_span = clean, doc_type, char_span
def __init__(self, language="en", clean=False, doc_type=None, char_span=False):
    """Segment a text into a list of sentences.

    Sentences can be returned with or without their character offsets
    from the original text.

    Parameters
    ----------
    language : str, optional
        Two-character ISO 639-1 code of the language to use,
        by default "en". It is passed to
        ``Language.get_language_code`` and the result is stored as
        ``language_module``.
    clean : bool, optional
        Cleans the original text, by default False.
    doc_type : str or None, optional
        Normal text or OCRed text, by default None. Set to ``pdf``
        for OCRed text; this requires ``clean=True``.
    char_span : bool, optional
        Get the start and end character offsets of each sentence
        within the original text, by default False. Cannot be combined
        with ``clean=True``.

    Raises
    ------
    ValueError
        If both ``clean`` and ``char_span`` are true, or if
        ``doc_type`` is ``'pdf'`` while ``clean`` is false.
        NOTE(review): ``Language.get_language_code`` may raise as well
        for an unsupported code -- confirm against its definition.
    """
    self.language = language
    self.language_module = Language.get_language_code(language)
    self.clean = clean
    self.doc_type = doc_type
    self.char_span = char_span

    # Offsets refer to the original text, which cleaning rewrites, so the
    # two options are mutually exclusive.
    if self.clean and self.char_span:
        raise ValueError("char_span must be False if clean is True. "
                         "Since `clean=True` will modify original text.")

    # A pdf doc_type forces the user to clean the text. Together with the
    # check above this also means char_span is never available for pdf.
    if self.doc_type == 'pdf' and not self.clean:
        # The trailing space after "original" keeps the concatenated
        # literals from running together as "originaltext".
        raise ValueError("`doc_type='pdf'` should have `clean=True` & "
                         "`char_span` should be False since original "
                         "text will be modified.")
def test_exception_on_no_lang_code_provided():
    """An empty language code is rejected with a descriptive ValueError."""
    expected_message = "Provide valid language ID i.e. ISO code."
    with pytest.raises(ValueError) as excinfo:
        Language.get_language_code('')
    # Checked after the block exits, once the exception has been captured.
    assert expected_message in str(excinfo.value)
def test_lang_code2instance_mapping():
    """Each code in LANGUAGE_CODES resolves to the module it is mapped to."""
    for iso_code, expected_module in LANGUAGE_CODES.items():
        resolved_module = Language.get_language_code(iso_code)
        assert resolved_module == expected_module