def _init_to_nasal_consonants(self):
    """
    Build patterns that rewrite anusvaara + class consonant as the class
    nasal + halant, e.g. for Devanagari:
    `r1_nasal=re.compile(r'\\u0902([\\u0915-\\u0918])')`
    """

    # Each signature is [class nasal, first consonant of the class,
    # last consonant of the class], as offsets within the script block.
    pat_signatures = \
        [
            [0x19, 0x15, 0x18],
            [0x1e, 0x1a, 0x1d],
            [0x23, 0x1f, 0x22],
            [0x28, 0x24, 0x27],
            [0x29, 0x24, 0x27],
            [0x2e, 0x2a, 0x2d],
        ]

    halant_offset = 0x4d
    anusvaara_offset = 0x02

    pats = []
    repl_strings = []

    for pat_signature in pat_signatures:
        pat = re.compile(r'{anusvaara}([{start_r}-{end_r}])'.format(
            anusvaara=langinfo.offset_to_char(anusvaara_offset, self.lang),
            start_r=langinfo.offset_to_char(pat_signature[1], self.lang),
            end_r=langinfo.offset_to_char(pat_signature[2], self.lang),
        ))
        pats.append(pat)

        repl_string = '{nasal}{halant}\\1'.format(
            nasal=langinfo.offset_to_char(pat_signature[0], self.lang),
            halant=langinfo.offset_to_char(halant_offset, self.lang),
        )
        repl_strings.append(repl_string)

    self.pats_repls = list(zip(pats, repl_strings))
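
# A minimal standalone sketch (not library code) of the substitution the
# patterns above perform, hardcoded for Devanagari: an anusvaara before a
# consonant of a class is rewritten as that class's nasal plus halant.
# The helper name is hypothetical.
import re

def _demo_to_nasal_consonants(text):
    # anusvaara (U+0902) before a velar (U+0915-U+0918) -> velar nasal + halant
    pat = re.compile('\u0902([\u0915-\u0918])')
    return pat.sub('\u0919\u094d\\1', text)

# e.g. _demo_to_nasal_consonants('संगम') -> 'सङ्गम'
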
def _generate_indic_hyperparams(self, params):
    src = params['src']
    tgt = params['tgt']

    ###### add bigram parameters dynamically

    ### add new mappings
    #for e_id, e_sym in self._translit_model.e_id_sym_map.items():
    #    offset = langinfo.get_offset(e_sym, tgt)
    #    if offset >= langinfo.COORDINATED_RANGE_START_INCLUSIVE and offset <= langinfo.COORDINATED_RANGE_END_INCLUSIVE:
    #        f_sym_x = UnicodeIndicTransliterator.transliterate(e_sym, tgt, src)
    #        if f_sym_x in self._translit_model.f_sym_id_map:
    #            ## add 1-2 mappings for consonants
    #            f_offset = langinfo.get_offset(f_sym_x, src)
    #            if f_offset >= 0x15 and f_offset <= 0x39:  ## if consonant
    #                # consonant with 'aa' maatra
    #                f_with_aa = f_sym_x + langinfo.offset_to_char(0x3e, src)
    #                self._translit_model.add_f_sym(f_with_aa)
    #                # consonant with halant
    #                f_with_halant = f_sym_x + langinfo.offset_to_char(0x4d, src)
    #                self._translit_model.add_f_sym(f_with_halant)

    ## initialize hyperparams
    alpha = np.ones((len(self._translit_model.e_sym_id_map),
                     len(self._translit_model.f_sym_id_map))) * params['scale_factor_mapping_exists']

    for e_id, e_sym in self._translit_model.e_id_sym_map.items():
        offset = langinfo.get_offset(e_sym, tgt)
        if langinfo.COORDINATED_RANGE_START_INCLUSIVE <= offset <= langinfo.COORDINATED_RANGE_END_INCLUSIVE:
            f_sym_x = UnicodeIndicTransliterator.transliterate(e_sym, tgt, src)
            if f_sym_x in self._translit_model.f_sym_id_map:
                alpha[e_id, self._translit_model.f_sym_id_map[f_sym_x]] = \
                    params['base_measure_mapping_exists']

                ## add 1-2 mappings for consonants
                f_offset = langinfo.get_offset(f_sym_x, src)
                if 0x15 <= f_offset <= 0x39:  ## if consonant
                    # consonant with 'aa' maatra
                    f_with_aa = f_sym_x + langinfo.offset_to_char(0x3e, src)
                    if f_with_aa in self._translit_model.f_sym_id_map:
                        alpha[e_id, self._translit_model.f_sym_id_map[f_with_aa]] = \
                            params['base_measure_mapping_exists']

                    # consonant with halant
                    f_with_halant = f_sym_x + langinfo.offset_to_char(0x4d, src)
                    if f_with_halant in self._translit_model.f_sym_id_map:
                        alpha[e_id, self._translit_model.f_sym_id_map[f_with_halant]] = \
                            params['base_measure_mapping_exists']

    return alpha
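
# Hedged sketch of the prior this method builds, with toy symbol maps and
# invented 'params' values: every (e, f) cell starts at
# scale_factor_mapping_exists, and cells whose target symbol transliterates to
# a known source symbol (or to that symbol plus the aa-maatra or halant) are
# raised to base_measure_mapping_exists.
import numpy as np

e_sym_id_map = {'क': 0, 'ख': 1}           # target-side symbols (hypothetical)
f_sym_id_map = {'ક': 0, 'કા': 1, 'ખ': 2}  # source-side symbols (hypothetical)
params = {'scale_factor_mapping_exists': 0.1,
          'base_measure_mapping_exists': 1.0}

alpha = np.ones((len(e_sym_id_map), len(f_sym_id_map))) * params['scale_factor_mapping_exists']
# pretend the transliterator maps क -> ક and ख -> ખ; boost those cells
alpha[e_sym_id_map['क'], f_sym_id_map['ક']] = params['base_measure_mapping_exists']
alpha[e_sym_id_map['क'], f_sym_id_map['કા']] = params['base_measure_mapping_exists']  # ક + aa maatra
alpha[e_sym_id_map['ख'], f_sym_id_map['ખ']] = params['base_measure_mapping_exists']
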
def _init_to_anusvaara_strict(self):
    """
    `r1_nasal=re.compile(r'\\u0919\\u094D([\\u0915-\\u0918])')`
    """

    pat_signatures = \
        [
            [0x19, 0x15, 0x18],
            [0x1e, 0x1a, 0x1d],
            [0x23, 0x1f, 0x22],
            [0x28, 0x24, 0x27],
            [0x29, 0x24, 0x27],
            [0x2e, 0x2a, 0x2d],
        ]

    halant_offset = 0x4d
    anusvaara_offset = 0x02

    pats = []

    for pat_signature in pat_signatures:
        pat = re.compile(r'{nasal}{halant}([{start_r}-{end_r}])'.format(
            nasal=langinfo.offset_to_char(pat_signature[0], self.lang),
            halant=langinfo.offset_to_char(halant_offset, self.lang),
            start_r=langinfo.offset_to_char(pat_signature[1], self.lang),
            end_r=langinfo.offset_to_char(pat_signature[2], self.lang),
        ))
        pats.append(pat)

    repl_string = '{anusvaara}\\1'.format(
        anusvaara=langinfo.offset_to_char(anusvaara_offset, self.lang))

    self.pats_repls = (pats, repl_string)
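
# Standalone sketch (Devanagari-only, not library code) of the strict
# conversion: a class nasal + halant is replaced by anusvaara only when a
# consonant of the same class follows, one compiled pattern per class.
import re

def _demo_to_anusvaara_strict(text):
    pats = [re.compile('\u0919\u094d([\u0915-\u0918])'),   # velar nasal + velar
            re.compile('\u091e\u094d([\u091a-\u091d])')]   # palatal nasal + palatal (other classes omitted)
    for pat in pats:
        text = pat.sub('\u0902\\1', text)
    return text

# e.g. _demo_to_anusvaara_strict('सङ्गम') -> 'संगम'
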
def compare_character_ratios(mono_datadir):
    ## read language data
    lang_data = []
    for lang in langs:
        with codecs.open('{}/{}.yaml'.format(mono_datadir, lang), 'r', 'utf-8') as datafile:
            lang_data.append(yaml.safe_load(datafile))

    ### Plot character ratios
    charratio_mat = np.zeros((len(langs),
                              langinfo.COORDINATED_RANGE_END_INCLUSIVE - langinfo.COORDINATED_RANGE_START_INCLUSIVE + 1))

    for i, lang in enumerate(langs):
        for c, v in lang_data[i]['char_proportions'].items():
            charratio_mat[i, c] = v

    ## plot
    matplotlib.rc('font', family='Lohit Hindi')
    fig, ax = plt.subplots()
    plt.pcolor(charratio_mat, cmap=plt.cm.hot_r, edgecolors='k')
    plt.colorbar()
    plt.xticks(np.arange(0, charratio_mat.shape[1]) + 0.5,
               [langinfo.offset_to_char(o, 'hi')
                for o in range(langinfo.COORDINATED_RANGE_START_INCLUSIVE,
                               langinfo.COORDINATED_RANGE_END_INCLUSIVE + 1)])
    plt.yticks(np.arange(0, charratio_mat.shape[0]) + 0.5, range(len(langs)))
    plt.show()
    plt.close()
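
# The per-language YAML files are assumed (their schema is not shown in this
# excerpt) to carry a 'char_proportions' mapping from coordinated-range offset
# to the fraction of the monolingual corpus that character accounts for;
# the equivalent Python structure would be:
lang_yaml_example = {
    'char_proportions': {
        0x15: 0.061,   # offset of the क-equivalent -> corpus proportion (made-up values)
        0x30: 0.083,   # offset of the र-equivalent
    }
}
# Row i of charratio_mat is a language and column c an offset, so the heatmap
# compares character usage across scripts position-by-position.
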
def _init_normalize_chandras(self):
    # map chandra vowels and chandrabindu to their non-chandra counterparts
    substitution_offsets = \
        [
            [0x0d, 0x0f],   # chandra e, independent
            [0x11, 0x13],   # chandra o, independent
            [0x45, 0x47],   # chandra e, dependent
            [0x49, 0x4b],   # chandra o, dependent
            # [0x72, 0x0f], # mr: chandra e, independent
            [0x00, 0x02],   # chandrabindu
            [0x01, 0x02],   # chandrabindu
        ]

    self.chandra_substitutions = [
        (langinfo.offset_to_char(x[0], self.lang), langinfo.offset_to_char(x[1], self.lang))
        for x in substitution_offsets
    ]
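
# Sketch of how the (from_char, to_char) pairs built above would be applied
# (the applying code is outside this excerpt): a plain str.replace per pair.
# For Devanagari, offset 0x45 is the chandra-e sign (U+0945) and 0x47 the e
# sign (U+0947); 0x49/0x4b are the chandra-o/o signs.
chandra_substitutions = [('\u0945', '\u0947'), ('\u0949', '\u094b')]

def _demo_normalize_chandras(text):
    for from_char, to_char in chandra_substitutions:
        text = text.replace(from_char, to_char)
    return text

# e.g. _demo_normalize_chandras('डॉक्टर') -> 'डोक्टर'
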
def _init_to_anusvaara_relaxed(self):
    """
    `r1_nasal=re.compile(r'[\\u0919\\u091E\\u0923\\u0928\\u0929\\u092E]\\u094D')`
    """
    nasals_list = [0x19, 0x1e, 0x23, 0x28, 0x29, 0x2e]
    # concatenate the nasal characters to form a regex character class
    # (no separator: a comma would itself become a member of the class)
    nasals_list_str = ''.join([langinfo.offset_to_char(x, self.lang) for x in nasals_list])

    halant_offset = 0x4d
    anusvaara_offset = 0x02

    pat = re.compile(r'[{nasals_list_str}]{halant}'.format(
        nasals_list_str=nasals_list_str,
        halant=langinfo.offset_to_char(halant_offset, self.lang),
    ))

    repl_string = '{anusvaara}'.format(
        anusvaara=langinfo.offset_to_char(anusvaara_offset, self.lang))

    self.pats_repls = (pat, repl_string)
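
# Devanagari-only sketch of the relaxed rule: any of the six nasals followed
# by halant collapses to anusvaara, regardless of which consonant follows.
import re

_relaxed_pat = re.compile('[\u0919\u091e\u0923\u0928\u0929\u092e]\u094d')

def _demo_to_anusvaara_relaxed(text):
    return _relaxed_pat.sub('\u0902', text)

# e.g. _demo_to_anusvaara_relaxed('सन्मति') -> 'संमति'
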
def _normalize_word_vowel_ending_dravidian(self, word):
    """
    for Dravidian languages
    - consonant ending: add the 'aa' maatra (offset 0x3e)
    - halant ending: no change
    - 'aa' maatra ending: no change
    """
    if len(word) > 0 and langinfo.is_consonant(word[-1], self.lang):
        return word + langinfo.offset_to_char(0x3e, self.lang)
    else:
        return word
def _normalize_word_vowel_ending_ie(self, word):
    """
    for IE (Indo-Aryan) languages
    - consonant ending: add halant
    - halant ending: no change
    - 'aa' maatra ending: no change
    """
    if len(word) > 0 and langinfo.is_consonant(word[-1], self.lang):
        return word + langinfo.offset_to_char(langinfo.HALANTA_OFFSET, self.lang)
    else:
        return word
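
# Standalone sketch contrasting the two ending conventions above. The real
# methods rely on langinfo.is_consonant; here the consonant test exploits the
# fact that each Indic block spans 128 codepoints, so `ord(ch) & 0x7f` yields
# the in-block offset, and 0x15-0x39 are the consonant slots.
def _demo_normalize_ending(word, appended_char):
    offset = ord(word[-1]) & 0x7f
    if 0x15 <= offset <= 0x39:   # consonant ending
        return word + appended_char
    return word

# Dravidian (Kannada): _demo_normalize_ending('ರಾಮ', '\u0cbe') -> 'ರಾಮಾ'
# IE (Hindi):          _demo_normalize_ending('राम', '\u094d') -> 'राम्'
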
def from_itrans(text, lang):
    """
    TODO: Document this method properly
    TODO: A little hack is used to handle schwa: needs to be documented
    TODO: check for robustness
    """

    MAXCODE = 4  ### TODO: Needs to be fixed

    ## handle duplicate ITRANS representations
    for k, v in DUPLICATE_ITRANS_REPRESENTATIONS.items():
        if k in text:
            text = text.replace(k, v)

    start = 0
    match = None
    solution = []

    i = start + 1
    while i <= len(text):

        itrans = text[start:i]

        # print('===')
        # print('i: {}'.format(i))
        # if i < len(text):
        #     print('c: {}'.format(text[i-1]))
        # print('start: {}'.format(start))
        # print('itrans: {}'.format(itrans))

        if itrans in ITRANS_TO_OFFSET:
            offs = ITRANS_TO_OFFSET[itrans]

            ## single element list - no problem
            ## except when it is 'a'
            ## 2 element list of 2 kinds:
            ###     1. alternate char for independent/dependent vowel
            ###     2. consonant + halant

            if len(offs) == 2 and \
                    langinfo.is_vowel_offset(offs[0]):
                ### 1. alternate char for independent/dependent vowel
                ## if previous is a consonant, then use the dependent vowel
                if len(solution) > 0 and langinfo.is_halanta(solution[-1], lang):
                    offs = [offs[1]]  ## dependent vowel
                else:
                    offs = [offs[0]]  ## independent vowel

            c = ''.join([langinfo.offset_to_char(x, lang) for x in offs])
            match = (i, c)

        elif len(itrans) == 1:  ## unknown character
            match = (i, itrans)

        elif i < len(text) and (i - start) < MAXCODE + 1:  ## continue matching till MAXCODE length substring
            i = i + 1
            continue

        else:
            solution.extend(match[1])
            # start = i - 1
            start = match[0]
            i = start
            match = None
            # print('match done')

        # print('match: {}'.format(match))
        i = i + 1

    ### flush matches
    if match is not None:
        solution.extend(match[1])

    #### post-processing
    ## delete unnecessary halants
    # print(''.join(solution))
    temp_out = list(''.join(solution))
    rem_indices = []
    for i in range(len(temp_out) - 1):
        if langinfo.is_halanta(temp_out[i], lang) and \
                (langinfo.is_vowel_sign(temp_out[i + 1], lang)
                 or langinfo.is_nukta(temp_out[i + 1], lang)
                 or temp_out[i + 1] == langinfo.offset_to_char(0x7f, lang)):
            rem_indices.append(i)
        # if temp_out[i] == langinfo.offset_to_char(0x7f, lang):
        #     rem_indices.append(i)
    for i in reversed(rem_indices):
        temp_out.pop(i)
    out = ''.join(temp_out)

    ## delete schwa placeholder
    out = out.replace(langinfo.offset_to_char(0x7f, lang), '')

    return out
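
# Hedged usage sketch: assuming the module's ITRANS_TO_OFFSET table covers the
# standard ITRANS codes, the longest-match decoding plus the halant/schwa
# cleanup above should behave like this:
#
#   from_itrans('rAma', 'hi')     # -> 'राम'  (trailing 'a' is the implicit schwa)
#   from_itrans('namaste', 'hi')  # -> 'नमस्ते'
#
# Each consonant code emits consonant + halant; a following vowel code picks
# the dependent form, and the post-processing step drops the now-redundant halant.
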
import sys

from indicnlp import langinfo
from indicnlp import loader

if __name__ == '__main__':
    """
    This script corrects the incorrect tokenization done by the Moses
    tokenizer, which splits on nukta and halant characters.

    Usage: python correct_moses_tokenizer.py <infname> <outfname> <langcode>
    """
    loader.load()

    infname = sys.argv[1]
    outfname = sys.argv[2]
    lang = sys.argv[3]

    halant_char = langinfo.offset_to_char(langinfo.HALANTA_OFFSET, lang)
    nukta_char = langinfo.offset_to_char(langinfo.NUKTA_OFFSET, lang)

    with open(infname, 'r', encoding='utf-8') as infile, \
         open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            outfile.write(
                line.replace(' {} '.format(halant_char), halant_char)
                    .replace(' {} '.format(nukta_char), nukta_char)
                    .replace(' {}{}'.format(nukta_char, halant_char),
                             '{}{}'.format(nukta_char, halant_char))
            )
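
# Illustration (standalone, not part of the script) of the repair on one
# Hindi line: Moses splits 'क्या' on the halant into 'क ् या'; rejoining the
# space-separated halant restores the original word.
_moses_line = 'क ् या हाल है'
_fixed_line = _moses_line.replace(' \u094d ', '\u094d')
assert _fixed_line == 'क्या हाल है'
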