def offset_to_char(off, lang):
    """
    Convert a script-relative offset into a character for ``lang``.

    Applicable to Brahmi derived Indic scripts.

    Args:
        off: offset to add to the first entry of ``li.SCRIPT_RANGES[lang]``.
        lang: language code used to select the script range.

    Returns:
        The character whose codepoint is ``off`` plus the start of the
        script range registered for ``lang``.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if is_supported_language(lang):
        script_start = li.SCRIPT_RANGES[lang][0]
        return chr(off + script_start)
    raise IndicNlpException('Language {} not supported'.format(lang))
def get_phonetic_info(lang):
    """
    Select the phonetic data and phonetic vectors to use for ``lang``.

    Args:
        lang: language code.

    Returns:
        A ``(phonetic_data, phonetic_vectors)`` tuple: the Tamil-specific
        pair when ``lang`` equals ``li.LC_TA``, the shared pair otherwise.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    # Tamil has its own tables; every other supported language shares one set.
    if lang == li.LC_TA:
        return (TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS)
    return (ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS)
def trivial_detokenize(s, lang='hi'):
    """
    Trivial detokenizer for languages in the Indian sub-continent.

    Args:
        s: tokenized text, handed unchanged to ``trivial_detokenize_indic``.
        lang: language code; defaults to ``'hi'``.

    Returns:
        Whatever ``trivial_detokenize_indic(s)`` returns.

    Raises:
        IndicNlpException: if ``lang`` is ``'ur'`` (no Urdu detokenizer).
    """
    # Urdu is the only language rejected here; everything else is delegated.
    if lang == 'ur':
        raise IndicNlpException('No detokenizer available for Urdu')
    return trivial_detokenize_indic(s)
def is_indiclang_char(c, lang):
    """
    Tell whether character ``c`` counts as a character of ``lang``'s script.

    Applicable to Brahmi derived Indic scripts.
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all
    Indic scripts, so they are accepted regardless of ``lang``.

    Args:
        c: a single character.
        lang: language code.

    Returns:
        True when the offset of ``c`` (per ``get_offset``) lies in
        ``[SCRIPT_OFFSET_START, SCRIPT_OFFSET_RANGE)``, or when ``c`` is the
        danda or double danda; False otherwise.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    offset = get_offset(c, lang)
    if SCRIPT_OFFSET_START <= offset < SCRIPT_OFFSET_RANGE:
        return True

    # Outside the script window: only the shared danda marks still qualify.
    codepoint = ord(c)
    return codepoint == li.DANDA or codepoint == li.DOUBLE_DANDA
def trivial_detokenize(text, lang='hi'):
    """Detokenize a string for languages of the Indian subcontinent.

    Thin dispatcher: every language except Urdu is delegated to
    ``trivial_detokenize_indic``. As originally documented, that trivial
    detokenizer:
        - decides whether punctuation attaches to left/right or both
        - handles number sequences
        - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process
        lang (str): language code; defaults to ``'hi'``

    Returns:
        str: detokenized string

    Raises:
        IndicNlpException: If ``lang`` is ``'ur'``, for which no
            detokenizer is available
    """
    # Guard first so the common path is a plain delegation.
    if lang == 'ur':
        raise IndicNlpException('No detokenizer available for Urdu')
    return trivial_detokenize_indic(text)
def in_coordinated_range(c, lang):
    """
    Apply ``in_coordinated_range_offset`` to the offset of character ``c``.

    Args:
        c: a single character.
        lang: language code used to compute the offset of ``c``.

    Returns:
        The result of ``in_coordinated_range_offset`` for the offset that
        ``get_offset(c, lang)`` yields.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if is_supported_language(lang):
        offset = get_offset(c, lang)
        return in_coordinated_range_offset(offset)
    raise IndicNlpException('Language {} not supported'.format(lang))
def get_offset(c, lang):
    """
    Compute the offset of character ``c`` from the start of ``lang``'s script.

    Args:
        c: a single character.
        lang: language code used to look up ``li.SCRIPT_RANGES``.

    Returns:
        ``ord(c)`` minus the first entry of ``li.SCRIPT_RANGES[lang]``; the
        result is negative when ``c`` precedes the script range.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))

    codepoint = ord(c)
    script_start = li.SCRIPT_RANGES[lang][0]
    return codepoint - script_start