コード例 #1
0
def offset_to_char(off, lang):
    """Convert a script-relative offset into the corresponding character.

    Applicable to Brahmi derived Indic scripts.

    Args:
        off: integer offset, added to the first entry of
            ``li.SCRIPT_RANGES[lang]`` (presumably the first code point of
            the script's Unicode range -- confirm against ``li``).
        lang: language code used to look up the script range.

    Returns:
        The single-character string ``chr(off + base)``.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if is_supported_language(lang):
        range_base = li.SCRIPT_RANGES[lang][0]
        return chr(off + range_base)

    raise IndicNlpException('Language {}  not supported'.format(lang))
コード例 #2
0
def get_phonetic_info(lang):
    """Select the phonetic data tables to use for a language.

    Args:
        lang: language code.

    Returns:
        A ``(phonetic_data, phonetic_vectors)`` tuple: the Tamil-specific
        tables when ``lang`` equals ``li.LC_TA``, otherwise the shared
        ``ALL_*`` tables.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {}  not supported'.format(lang))

    # Tamil is the only language with its own tables; everything else shares
    # the common ones.
    if lang != li.LC_TA:
        return (ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS)
    return (TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS)
コード例 #3
0
def trivial_detokenize(s, lang='hi'):
    """Trivial detokenizer for languages in the Indian sub-continent.

    Args:
        s: tokenized text, passed unchanged to ``trivial_detokenize_indic``.
        lang: language code (default ``'hi'``); only used to reject Urdu.

    Returns:
        Whatever ``trivial_detokenize_indic(s)`` returns.

    Raises:
        IndicNlpException: if ``lang`` is ``'ur'``.
    """
    # Guard clause: Urdu is the one language this entry point refuses.
    if lang == 'ur':
        raise IndicNlpException('No detokenizer available for Urdu')

    return trivial_detokenize_indic(s)
コード例 #4
0
def is_indiclang_char(c, lang):
    """Tell whether a character counts as belonging to the script of `lang`.

    Applicable to Brahmi derived Indic scripts.
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all
    Indic scripts, so they are accepted regardless of `lang`.

    Args:
        c: a single character.
        lang: language code.

    Returns:
        True when the offset of ``c`` (from ``get_offset``) lies in
        ``[SCRIPT_OFFSET_START, SCRIPT_OFFSET_RANGE)``, or when ``c`` is
        the danda or double danda; False otherwise.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {}  not supported'.format(lang))

    offset = get_offset(c, lang)
    if SCRIPT_OFFSET_START <= offset < SCRIPT_OFFSET_RANGE:
        return True

    # Outside the script range: only the shared punctuation marks qualify.
    codepoint = ord(c)
    return codepoint == li.DANDA or codepoint == li.DOUBLE_DANDA
コード例 #5
0
def trivial_detokenize(text, lang='hi'):
    """Detokenize a string for languages of the Indian subcontinent.

    Thin dispatcher around ``trivial_detokenize_indic``, which is
    documented as a trivial detokenizer that:

        - decides whether punctuation attaches to left/right or both
        - handles number sequences
        - handles quotes smartly (deciding left or right attachment)

    Args:
        text (str): tokenized text to process
        lang (str): language code, ``'hi'`` by default; only checked
            against ``'ur'``

    Returns:
        str: detokenized string

    Raises:
        IndicNlpException: If the language is Urdu (``'ur'``), for which
            no detokenizer is available
    """
    urdu_requested = (lang == 'ur')
    if urdu_requested:
        raise IndicNlpException('No detokenizer available for Urdu')

    detokenized = trivial_detokenize_indic(text)
    return detokenized
コード例 #6
0
def in_coordinated_range(c, lang):
    """Apply ``in_coordinated_range_offset`` to the offset of a character.

    Args:
        c: a single character.
        lang: language code used to compute the character's offset.

    Returns:
        The result of ``in_coordinated_range_offset`` for the offset that
        ``get_offset(c, lang)`` yields.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if is_supported_language(lang):
        char_offset = get_offset(c, lang)
        return in_coordinated_range_offset(char_offset)

    raise IndicNlpException('Language {}  not supported'.format(lang))
コード例 #7
0
def get_offset(c, lang):
    """Return the offset of a character relative to the script range of `lang`.

    Args:
        c: a single character.
        lang: language code used to look up ``li.SCRIPT_RANGES``.

    Returns:
        ``ord(c)`` minus the first entry of ``li.SCRIPT_RANGES[lang]``.
        The value can be negative or beyond the script's range when ``c``
        is not from that script; no bounds check is done here.

    Raises:
        IndicNlpException: if ``is_supported_language(lang)`` is false.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {}  not supported'.format(lang))

    codepoint = ord(c)
    range_base = li.SCRIPT_RANGES[lang][0]
    return codepoint - range_base