def _init_to_nasal_consonants(self):
        """
        `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
        """

        # Each signature lists [class nasal, varga start, varga end] offsets.
        pat_signatures = \
            [
                 [0x19, 0x15, 0x18],
                 [0x1e, 0x1a, 0x1d],
                 [0x23, 0x1f, 0x22],
                 [0x28, 0x24, 0x27],
                 [0x29, 0x24, 0x27],
                 [0x2e, 0x2a, 0x2d],
            ]

        halant_offset = 0x4d
        anusvaara_offset = 0x02

        pats = []
        repl_strings = []

        for pat_signature in pat_signatures:
            pat = re.compile(r'{anusvaara}([{start_r}-{end_r}])'.format(
                anusvaara=langinfo.offset_to_char(anusvaara_offset, self.lang),
                start_r=langinfo.offset_to_char(pat_signature[1], self.lang),
                end_r=langinfo.offset_to_char(pat_signature[2], self.lang),
            ))
            pats.append(pat)
            repl_string = '{nasal}{halant}\\1'.format(
                nasal=langinfo.offset_to_char(pat_signature[0], self.lang),
                halant=langinfo.offset_to_char(halant_offset, self.lang),
            )
            repl_strings.append(repl_string)

        self.pats_repls = list(zip(pats, repl_strings))
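
    # A minimal sketch (the method name is an assumption, not part of the
    # original source) of how the (pattern, replacement) pairs built above
    # might be applied: each anusvaara followed by a varga consonant is
    # rewritten as that varga's class nasal plus halant.
    def _to_nasal_consonants_sketch(self, text):
        for pat, repl in self.pats_repls:
            text = pat.sub(repl, text)
        return text
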
    def _generate_indic_hyperparams(self, params):

        src = params['src']
        tgt = params['tgt']

        ###### add bigram parameters dynamically

        ### add new mappings
        #for e_id, e_sym in self._translit_model.e_id_sym_map.iteritems():
        #    offset= langinfo.get_offset(e_sym,tgt)
        #    if offset >=langinfo.COORDINATED_RANGE_START_INCLUSIVE and offset <= langinfo.COORDINATED_RANGE_END_INCLUSIVE:
        #        f_sym_x=UnicodeIndicTransliterator.transliterate(e_sym,tgt,src)
        #        if f_sym_x in self._translit_model.f_sym_id_map:

        #            ## add 1-2 mappings for consonants
        #            f_offset=langinfo.get_offset(f_sym_x,src)
        #            if f_offset >=0x15 and f_offset <= 0x39:  ## if consonant
        #                # consonant with aa ki maatra
        #                f_with_aa=f_sym_x+langinfo.offset_to_char(0x3e,src)
        #                self._translit_model.add_f_sym(f_with_aa)

        #                # consonant with halant
        #                f_with_halant=f_sym_x+langinfo.offset_to_char(0x4d,src)
        #                self._translit_model.add_f_sym(f_with_halant)

        ## initialize hyperparams: alpha[e_id, f_id] is the Dirichlet
        ## hyperparameter for mapping target symbol e to source symbol f
        alpha = np.ones((len(self._translit_model.e_sym_id_map),
                         len(self._translit_model.f_sym_id_map)
                         )) * params['scale_factor_mapping_exists']

        for e_id, e_sym in self._translit_model.e_id_sym_map.items():
            offset = langinfo.get_offset(e_sym, tgt)
            if langinfo.COORDINATED_RANGE_START_INCLUSIVE <= offset <= langinfo.COORDINATED_RANGE_END_INCLUSIVE:
                f_sym_x = UnicodeIndicTransliterator.transliterate(
                    e_sym, tgt, src)
                if f_sym_x in self._translit_model.f_sym_id_map:
                    alpha[e_id,
                          self._translit_model.f_sym_id_map[f_sym_x]] = params[
                              'base_measure_mapping_exists']

                    ## add 1-2 mappings for consonants
                    f_offset = langinfo.get_offset(f_sym_x, src)
                    if f_offset >= 0x15 and f_offset <= 0x39:  ## if consonant
                        # consonant with aa ki maatra
                        f_with_aa = f_sym_x + langinfo.offset_to_char(
                            0x3e, src)
                        if f_with_aa in self._translit_model.f_sym_id_map:
                            alpha[e_id, self._translit_model.
                                  f_sym_id_map[f_with_aa]] = params[
                                      'base_measure_mapping_exists']

                        # consonant with halant
                        f_with_halant = f_sym_x + langinfo.offset_to_char(
                            0x4d, src)
                        if f_with_halant in self._translit_model.f_sym_id_map:
                            alpha[e_id, self._translit_model.
                                  f_sym_id_map[f_with_halant]] = params[
                                      'base_measure_mapping_exists']

        return alpha
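
    # A minimal sketch (an assumption about downstream use, not from the
    # original source) of how the hyperparameter matrix could seed an initial
    # mapping distribution: row-normalize alpha so each target symbol's
    # mapping probabilities sum to 1.
    def _init_mapping_probs_sketch(self, alpha):
        return alpha / alpha.sum(axis=1, keepdims=True)
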
    def _init_to_anusvaara_strict(self):
        """
        `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
        """
    
        pat_signatures=\
            [
                 [0x19,0x15,0x18],
                 [0x1e,0x1a,0x1d],            
                 [0x23,0x1f,0x22],                        
                 [0x28,0x24,0x27],        
                 [0x29,0x24,0x27],                    
                 [0x2e,0x2a,0x2d],                    
            ]    
        
        halant_offset=0x4d
        anusvaara_offset=0x02
        
        pats=[]
        
        for pat_signature in pat_signatures:
            pat=re.compile(r'{nasal}{halant}([{start_r}-{end_r}])'.format(
                nasal=langinfo.offset_to_char(pat_signature[0],self.lang),
                halant=langinfo.offset_to_char(halant_offset,self.lang),
                start_r=langinfo.offset_to_char(pat_signature[1],self.lang),
                end_r=langinfo.offset_to_char(pat_signature[2],self.lang),
            ))
            pats.append(pat)
        
        repl_string='{anusvaara}\\1'.format(anusvaara=langinfo.offset_to_char(anusvaara_offset,self.lang))

        self.pats_repls=(pats,repl_string)
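
    # A minimal sketch (method name assumed): here self.pats_repls is a
    # (patterns, replacement) pair with one shared replacement string, unlike
    # the per-pattern pairs used by _init_to_nasal_consonants.
    def _to_anusvaara_strict_sketch(self, text):
        pats, repl_string = self.pats_repls
        for pat in pats:
            text = pat.sub(repl_string, text)
        return text
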
## imports assumed by this snippet; `langs` must be a list of language codes
## defined elsewhere at module level
import codecs
import yaml
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from indicnlp import langinfo

def compare_character_ratios(mono_datadir):

    ## read per-language character statistics
    lang_data = []
    for lang in langs:
        with codecs.open('{}/{}.yaml'.format(mono_datadir, lang), 'r', 'utf-8') as datafile:
            lang_data.append(yaml.safe_load(datafile))

    ### build a languages x characters matrix of character proportions
    charratio_mat = np.zeros(
        (len(langs),
         langinfo.COORDINATED_RANGE_END_INCLUSIVE -
         langinfo.COORDINATED_RANGE_START_INCLUSIVE + 1))
    for i, lang in enumerate(langs):
        for c, v in lang_data[i]['char_proportions'].items():
            charratio_mat[i, c] = v

    ## plot as a heatmap
    matplotlib.rc('font', family='Lohit Hindi')
    fig, ax = plt.subplots()

    plt.pcolor(charratio_mat, cmap=plt.cm.hot_r, edgecolors='k')
    plt.colorbar()

    plt.xticks(
        np.arange(0, charratio_mat.shape[1]) + 0.5,
        [langinfo.offset_to_char(o, 'hi')
         for o in range(langinfo.COORDINATED_RANGE_START_INCLUSIVE,
                        langinfo.COORDINATED_RANGE_END_INCLUSIVE + 1)])
    plt.yticks(np.arange(0, charratio_mat.shape[0]) + 0.5,
               list(range(len(langs))))

    plt.show()
    plt.close()
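
## hypothetical usage (the directory path is illustrative): expects one
## '<lang>.yaml' file per entry of `langs`, each containing a
## 'char_proportions' map from character offset to proportion
# compare_character_ratios('analysis/mono_stats')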
    def _init_normalize_chandras(self):

        substitution_offsets = \
            [
                [0x0d, 0x0f],  # chandra e, independent
                [0x11, 0x13],  # chandra o, independent
                [0x45, 0x47],  # chandra e, dependent
                [0x49, 0x4b],  # chandra o, dependent
                # [0x72, 0x0f],  # mr: chandra e, independent

                [0x00, 0x02],  # inverted chandrabindu
                [0x01, 0x02],  # chandrabindu
            ]

        self.chandra_substitutions = [
            (langinfo.offset_to_char(x[0], self.lang),
             langinfo.offset_to_char(x[1], self.lang))
            for x in substitution_offsets
        ]
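
    # A minimal sketch (method name assumed) of applying the substitutions
    # built above: each chandra form is replaced by its plain counterpart.
    def _normalize_chandras_sketch(self, text):
        for chandra, replacement in self.chandra_substitutions:
            text = text.replace(chandra, replacement)
        return text
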
    def _init_to_anusvaara_relaxed(self):
        """
        `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
        """
            
        nasals_list = [0x19, 0x1e, 0x23, 0x28, 0x29, 0x2e]
        # join without a separator so that only the nasal characters (and not
        # literal commas) end up inside the regex character class
        nasals_list_str = ''.join(
            [langinfo.offset_to_char(x, self.lang) for x in nasals_list])

        halant_offset = 0x4d
        anusvaara_offset = 0x02

        pat = re.compile(r'[{nasals_list_str}]{halant}'.format(
            nasals_list_str=nasals_list_str,
            halant=langinfo.offset_to_char(halant_offset, self.lang),
        ))

        repl_string = langinfo.offset_to_char(anusvaara_offset, self.lang)

        self.pats_repls = (pat, repl_string)
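
    # A minimal sketch (method name assumed) of applying the relaxed rule:
    # any nasal + halant sequence collapses to the anusvaara.
    def _to_anusvaara_relaxed_sketch(self, text):
        pat, repl_string = self.pats_repls
        return pat.sub(repl_string, text)
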
    def _normalize_word_vowel_ending_dravidian(self, word):
        """
        for Dravidian languages
        - consonant ending: add 'aa' ki maatra (vowel sign at offset 0x3e)
        - halant ending: no change
        - 'aa' ki maatra ending: no change
        """
        if len(word) > 0 and langinfo.is_consonant(word[-1], self.lang):
            return word + langinfo.offset_to_char(0x3e, self.lang)
        else:
            return word

    def _normalize_word_vowel_ending_ie(self, word):
        """
        for Indo-European languages
        - consonant ending: add halant
        - halant ending: no change
        - 'aa' ki maatra ending: no change
        """
        if len(word) > 0 and langinfo.is_consonant(word[-1], self.lang):
            return word + langinfo.offset_to_char(langinfo.HALANTA_OFFSET,
                                                  self.lang)
        else:
            return word
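
    # A minimal dispatch sketch between the two endings above (the
    # language-family set below is illustrative, not from the original
    # source):
    def _normalize_word_vowel_ending_sketch(self, word):
        dravidian_langs = {'kn', 'te', 'ta', 'ml'}  # illustrative set
        if self.lang in dravidian_langs:
            return self._normalize_word_vowel_ending_dravidian(word)
        else:
            return self._normalize_word_vowel_ending_ie(word)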
    def from_itrans(text, lang):
        """
        TODO: Document this method properly
        TODO: A little hack is used to handle schwa: needs to be documented
        TODO: check for robustness
        """

        MAXCODE = 4  ### TODO: Needs to be fixed

        ##  handle_duplicate_itrans_representations
        for k, v in DUPLICATE_ITRANS_REPRESENTATIONS.items():
            if k in text:
                text = text.replace(k, v)

        start = 0
        match = None
        solution = []

        i = start + 1
        while i <= len(text):

            itrans = text[start:i]

            if itrans in ITRANS_TO_OFFSET:
                offs = ITRANS_TO_OFFSET[itrans]

                ## single element list - no problem
                ## except when it is 'a'

                ## 2 element list of 2 kinds:
                ### 1. alternate char for independent/dependent vowel
                ### 2. consonant + halant

                if len(offs)==2 and \
                    langinfo.is_vowel_offset(offs[0]):
                    ### 1. alternate char for independent/dependent vowel
                    ## if previous is a consonant, then use the dependent vowel
                    if len(solution) > 0 and langinfo.is_halanta(
                            solution[-1], lang):
                        offs = [offs[1]]  ## dependent vowel
                    else:
                        offs = [offs[0]]  ## independent vowel

                c = ''.join([langinfo.offset_to_char(x, lang) for x in offs])
                match = (i, c)

            elif len(itrans) == 1:  ## unknown character
                match = (i, itrans)
            elif i < len(text) and (i - start) < MAXCODE + 1:
                ## continue matching till MAXCODE-length substring
                i = i + 1
                continue
            else:
                ## no more extension possible: commit the last match and
                ## restart scanning right after it
                solution.extend(match[1])
                start = match[0]
                i = start
                match = None

            i = i + 1

        ### flush matches
        if match is not None:
            solution.extend(match[1])

        #### post-processing
        ## delete unnecessary halants
        temp_out = list(''.join(solution))
        rem_indices = []
        for i in range(len(temp_out) - 1):
            if langinfo.is_halanta(temp_out[i], lang) and \
                    (langinfo.is_vowel_sign(temp_out[i + 1], lang)
                     or langinfo.is_nukta(temp_out[i + 1], lang)
                     or temp_out[i + 1] == langinfo.offset_to_char(0x7f, lang)):
                rem_indices.append(i)

        for i in reversed(rem_indices):
            temp_out.pop(i)

        out = ''.join(temp_out)

        ## delete schwa placeholder
        out = out.replace(langinfo.offset_to_char(0x7f, lang), '')

        return out
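
    # Hypothetical usage (assumes this method lives on the Indic NLP
    # Library's ItransTransliterator class; the input string is illustrative):
    # devanagari = ItransTransliterator.from_itrans('bhaarata', 'hi')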
import sys
from indicnlp import langinfo
from indicnlp import loader

if __name__ == '__main__':
    """
        This script corrects the incorrect tokenization done by Moses tokenizer.
        The Moses tokenizer splits on nukta and halant characters
        Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>
    """

    loader.load()

    infname = sys.argv[1]
    outfname = sys.argv[2]
    lang = sys.argv[3]

    halant_char = langinfo.offset_to_char(langinfo.HALANTA_OFFSET, lang)
    nukta_char = langinfo.offset_to_char(langinfo.NUKTA_OFFSET, lang)

    with open(infname, 'r', encoding='utf-8') as infile, \
         open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            ## rejoin halant and nukta characters that the tokenizer split off
            line = line.replace(' {} '.format(halant_char), halant_char)
            line = line.replace(' {} '.format(nukta_char), nukta_char)
            line = line.replace(' {}{}'.format(nukta_char, halant_char),
                                '{}{}'.format(nukta_char, halant_char))
            outfile.write(line)
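
    ## illustrative effect (hypothetical input line): a halant split off by
    ## the tokenizer, e.g. 'क ् या', is rejoined to 'क्या' by the
    ## replacements above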