def lcsr(srcw, tgtw, slang, tlang):
    """Compute LCSR between two words, dispatching on script support.

    Uses the Indic-aware implementation only when both languages are
    distinct and both are supported by the Indic script toolkit;
    otherwise falls back to the generic implementation.
    """
    needs_generic = (slang == tlang
                     or not isc.is_supported_language(slang)
                     or not isc.is_supported_language(tlang))
    if needs_generic:
        return lcsr_any(srcw, tgtw, slang, tlang)
    return lcsr_indic(srcw, tgtw, slang, tlang)
def is_consonant(self, c, lang):
    """Return True if character `c` is a consonant in language `lang`.

    For languages supported by the Indic script toolkit, decides via the
    phonetic feature vector; otherwise falls back to the manually curated
    per-language consonant set on this object.

    Raises:
        Exception: if `lang` is neither Indic-supported nor present in
            `self.consonant_set`.
    """
    if isc.is_supported_language(lang):
        return isc.is_consonant(isc.get_phonetic_feature_vector(c, lang))
    elif lang in self.consonant_set:
        return c in self.consonant_set[lang]
    else:
        # Fixed typo in error message: was 'Language no supported'
        raise Exception(
            'Language not supported. Add list of consonants for this language')
def is_vowel(self, c, lang):
    """Return True if character `c` is a vowel in language `lang`.

    For languages supported by the Indic script toolkit, decides via the
    phonetic feature vector; otherwise falls back to the manually curated
    per-language vowel set on this object.

    Raises:
        Exception: if `lang` is neither Indic-supported nor present in
            `self.vowel_set`.
    """
    if isc.is_supported_language(lang):
        return isc.is_vowel(isc.get_phonetic_feature_vector(c, lang))
    elif lang in self.vowel_set:
        return c in self.vowel_set[lang]
    else:
        # Fixed typo in error message: was 'Language no supported'
        raise Exception(
            'Language not supported. Add list of vowels for this language')
def get_label(x, lang):
    """Build a display label for character `x`.

    Indic in-range chars get an itrans romanization plus hex offset;
    other Indic chars get their hex codepoint; non-Indic chars are
    returned unchanged.
    """
    if not isc.is_supported_language(lang):
        return x
    if isc.in_coordinated_range(x, lang):
        romanized = indtrans.ItransTransliterator.to_itrans(x, lang)
        return romanized + '({:2x})'.format(isc.get_offset(x, lang))
    return str(hex(ord(x)))
def get_column_name(x, tlang):
    """
    Get column name (char) in ascii (romanized)
    """
    if isc.is_supported_language(tlang):
        if isc.in_coordinated_range(x, tlang):
            romanized = indtrans.ItransTransliterator.to_itrans(x, tlang)
            return romanized + '({:2x})'.format(isc.get_offset(x, tlang))
        return str(hex(ord(x)))
    if tlang == 'ar':
        # Arabic uses a dedicated Arabic-to-roman transliterator
        return a2r_xlit.transliterate(x)
    return x
def input_chars_to_analyze():
    """
    Input in code: what characters to input
    """
    if FLAGS.lang == 'en':
        return ['A', 'E', 'I', 'O', 'U']
    if isc.is_supported_language(FLAGS.lang):
        ## only vowel diacritics included
        return [isc.offset_to_char(off, FLAGS.lang)
                for off in range(0x3e, 0x4d)]
    if slavic_characters.is_supported_language_latin(FLAGS.lang):
        return slavic_characters.latin_vowels
    # no branch matched: nothing to analyze
    return []
def is_supported_language(self, lang):
    """Whether `lang` is handled: Indic-supported or has a curated vowel set."""
    supported = isc.is_supported_language(lang)
    return supported or lang in self.vowel_set
def run_sort_errors(basedir, exp_conf_fname):
    """Produce a frequency-sorted character-error report per experiment.

    For every experiment row in the CSV at `exp_conf_fname`, reads the
    previously generated alignment counts, keeps only mismatched
    (ref_char != out_char) pairs, annotates them with romanized and
    hex-codepoint columns, sorts by the 'count' column descending, and
    writes `err_count.csv` into the experiment's analysis directory.

    Args:
        basedir: root directory containing `results/sup/...` experiment dirs.
        exp_conf_fname: comma-separated CSV with a header row; columns read
            per record: src, tgt, epoch, dataset, exp, representation.
    """
    ## read the list of experiments to be analyzed
    print 'Read list of experiments'
    conf_df = pd.read_csv(exp_conf_fname, header=0, sep=',')
    augmented_data = []  # NOTE(review): never appended to in this function
    # x[1] is the row Series yielded by DataFrame.iterrows()
    for rec in [x[1] for x in conf_df.iterrows()]:
        slang = rec['src']
        tlang = rec['tgt']
        epoch = rec['epoch']
        edir = get_edir(rec)
        exp_dirname = '{basedir}/results/sup/{dataset}/{exp}/{rep}/{edir}'.format(
            basedir=basedir, dataset=rec['dataset'], rep=rec['representation'],
            exp=rec['exp'], edir=edir)
        out_dirname = '{exp_dirname}/outputs/{epoch:03d}_analysis_{slang}-{tlang}'.format(
            exp_dirname=exp_dirname, epoch=epoch, slang=slang, tlang=tlang)
        print 'Starting Experiment: ' + exp_dirname
        if os.path.isdir(out_dirname):
            a_df = align.read_align_count_file(
                '{}/alignment_count.csv'.format(out_dirname))
            # keep only the misaligned character pairs
            err_df = a_df[a_df.ref_char != a_df.out_char].copy(deep=True)
            if isc.is_supported_language(tlang):
                # Indic target: itrans romanization + in-block script offset
                err_df['roman_ref'] = err_df.apply(
                    lambda x: (indtrans.ItransTransliterator.to_itrans(
                        x['ref_char'], tlang)), axis=1)
                err_df['roman_out'] = err_df.apply(
                    lambda x: (indtrans.ItransTransliterator.to_itrans(
                        x['out_char'], tlang)), axis=1)
                err_df['unicode_ref'] = err_df.apply(
                    lambda x: ('{:2x}'.format(isc.get_offset(x['ref_char'],
                                                             tlang))), axis=1)
                err_df['unicode_out'] = err_df.apply(
                    lambda x: ('{:2x}'.format(isc.get_offset(x['out_char'],
                                                             tlang))), axis=1)
            if tlang == 'ar':
                # Arabic target: dedicated transliterator + full codepoints
                err_df['roman_ref'] = err_df.apply(
                    lambda x: (a2r_xlit.transliterate(x['ref_char'])), axis=1)
                err_df['roman_out'] = err_df.apply(
                    lambda x: (a2r_xlit.transliterate(x['out_char'])), axis=1)
                err_df['unicode_ref'] = err_df.apply(
                    lambda x: ('{:4x}'.format(ord(x['ref_char']))), axis=1)
                err_df['unicode_out'] = err_df.apply(
                    lambda x: ('{:4x}'.format(ord(x['out_char']))), axis=1)
            if align.cci.is_supported_language(tlang):
                # character-category annotations when the categorizer knows tlang
                err_df['charcat_ref'] = err_df.apply(
                    lambda x: align.cci.get_char_type(x['ref_char'], tlang),
                    axis=1)
                err_df['charcat_out'] = err_df.apply(
                    lambda x: align.cci.get_char_type(x['out_char'], tlang),
                    axis=1)
            # most frequent errors first
            err_df.sort_values(by='count', axis=0, ascending=False, inplace=True)
            err_df.to_csv('{}/err_count.csv'.format(out_dirname),
                          encoding='utf-8')
        else:
            print 'WARNING (run_sort_errors): Could not analyze following experiment: {} {} {} {} {} epoch: {}'.format(
                rec['dataset'], rec['exp'], rec['representation'], slang, tlang, epoch)
        print 'End Experiment: ' + exp_dirname
def is_supported_language(self, lang):
    """Return whether `lang` can be processed by this object.

    A language qualifies if the Indic script toolkit supports it, or if a
    custom vowel set has been registered for it.
    """
    indic_ok = isc.is_supported_language(lang)
    has_vowel_set = lang in self.vowel_set
    return indic_ok or has_vowel_set