Exemple #1
0
def lcsr(srcw, tgtw, slang, tlang):
    """Dispatch an LCSR computation to the generic or the Indic routine.

    ``lcsr_indic`` is chosen only when the two language codes differ and
    ``isc.is_supported_language`` accepts both of them; in every other
    case ``lcsr_any`` is used. All four arguments are forwarded unchanged
    and the chosen routine's result is returned as is.
    """
    use_generic = slang == tlang or not (
        isc.is_supported_language(slang)
        and isc.is_supported_language(tlang))
    handler = lcsr_any if use_generic else lcsr_indic
    return handler(srcw, tgtw, slang, tlang)
 def is_consonant(self, c, lang):
     """Tell whether character ``c`` is a consonant in language ``lang``.

     When ``isc.is_supported_language(lang)`` is true, the answer comes
     from ``isc.is_consonant`` applied to the phonetic feature vector of
     ``c``. Otherwise the character is looked up in
     ``self.consonant_set[lang]``.

     Raises:
         Exception: if ``lang`` is neither supported by ``isc`` nor a key
             of ``self.consonant_set``.
     """
     if isc.is_supported_language(lang):
         feature_vector = isc.get_phonetic_feature_vector(c, lang)
         return isc.is_consonant(feature_vector)

     if lang not in self.consonant_set:
         raise Exception('Language no supported. Add list of consonants for this language')

     return c in self.consonant_set[lang]
Exemple #3
0
 def is_vowel(self, c, lang):
     """Tell whether character ``c`` is a vowel in language ``lang``.

     When ``isc.is_supported_language(lang)`` is true, the answer comes
     from ``isc.is_vowel`` applied to the phonetic feature vector of
     ``c``. Otherwise the character is looked up in
     ``self.vowel_set[lang]``.

     Raises:
         Exception: if ``lang`` is neither supported by ``isc`` nor a key
             of ``self.vowel_set``.
     """
     if isc.is_supported_language(lang):
         feature_vector = isc.get_phonetic_feature_vector(c, lang)
         return isc.is_vowel(feature_vector)

     if lang not in self.vowel_set:
         raise Exception(
             'Language no supported. Add list of vowels for this language')

     return c in self.vowel_set[lang]
Exemple #4
0
def get_label(x, lang):
    """Build a printable label for character ``x`` of language ``lang``.

    * Language not accepted by ``isc.is_supported_language``: ``x`` is
      returned unchanged.
    * Supported language but ``x`` outside the coordinated range: the
      hexadecimal form of the character's code point (``hex(ord(x))``).
    * Otherwise: the result of ``ItransTransliterator.to_itrans`` followed
      by the character's offset, in hex, inside parentheses.
    """
    if not isc.is_supported_language(lang):
        return x

    if not isc.in_coordinated_range(x, lang):
        return str(hex(ord(x)))

    romanized = indtrans.ItransTransliterator.to_itrans(x, lang)
    offset_tag = '({:2x})'.format(isc.get_offset(x, lang))
    return romanized + offset_tag
Exemple #5
0
def get_column_name(x, tlang):
    """
    Get the column name for character ``x`` in ASCII (romanized) form.

    * Language accepted by ``isc.is_supported_language``: the
      ``to_itrans`` transliteration plus the hex offset in parentheses
      when ``x`` is in the coordinated range, otherwise the hex form of
      the character's code point.
    * ``tlang == 'ar'``: the result of ``a2r_xlit.transliterate(x)``.
    * Any other language: ``x`` unchanged.
    """
    if isc.is_supported_language(tlang):
        if not isc.in_coordinated_range(x, tlang):
            return str(hex(ord(x)))
        romanized = indtrans.ItransTransliterator.to_itrans(x, tlang)
        offset_tag = '({:2x})'.format(isc.get_offset(x, tlang))
        return romanized + offset_tag

    if tlang == 'ar':
        return a2r_xlit.transliterate(x)

    return x
Exemple #6
0
def input_chars_to_analyze():
    """
    Return the hard-coded list of characters to analyze for ``FLAGS.lang``.

    * ``'en'``: the five upper-case vowels.
    * Language accepted by ``isc.is_supported_language``: the characters
      for offsets ``0x3e`` up to (not including) ``0x4d``.
    * Language accepted by
      ``slavic_characters.is_supported_language_latin``:
      ``slavic_characters.latin_vowels``.
    * Anything else: an empty list.
    """
    lang = FLAGS.lang

    if lang == 'en':
        return ['A', 'E', 'I', 'O', 'U']

    if isc.is_supported_language(lang):
        ## only vowel diacritics included
        return [isc.offset_to_char(offset, lang)
                for offset in range(0x3e, 0x4d)]

    if slavic_characters.is_supported_language_latin(lang):
        return slavic_characters.latin_vowels

    return []
Exemple #7
0
 def is_supported_language(self, lang):
     """Report whether ``lang`` can be handled.

     A language qualifies when ``isc.is_supported_language`` gives a
     truthy answer (which is returned as is) or, failing that, when it is
     a key of ``self.vowel_set``.
     """
     isc_answer = isc.is_supported_language(lang)
     if isc_answer:
         return isc_answer
     return lang in self.vowel_set
Exemple #8
0
def _add_ref_out_columns(err_df, prefix, char_func):
    """Add ``<prefix>_ref`` and ``<prefix>_out`` columns to ``err_df``.

    Each new column holds ``char_func`` applied, row by row, to the
    ``ref_char`` / ``out_char`` value of that row. The ``ref`` column is
    created before the ``out`` column. ``err_df`` is modified in place.
    """
    for side in ('ref', 'out'):
        char_col = side + '_char'
        # Bind char_col as a default so the lambda does not depend on the
        # loop variable's later value.
        err_df['{}_{}'.format(prefix, side)] = err_df.apply(
            lambda row, char_col=char_col: char_func(row[char_col]), axis=1)


def run_sort_errors(basedir, exp_conf_fname):
    """Write a sorted character-error table for every listed experiment.

    ``exp_conf_fname`` is read as a comma-separated file with a header
    row; each row must provide ``src``, ``tgt``, ``epoch``, ``dataset``,
    ``exp`` and ``representation`` (plus whatever ``get_edir`` reads).

    For each row the analysis directory
    ``{basedir}/results/sup/{dataset}/{exp}/{rep}/{edir}/outputs/
    {epoch:03d}_analysis_{src}-{tgt}`` is built. If it exists,
    ``alignment_count.csv`` is loaded from it, only the rows whose
    ``ref_char`` differs from ``out_char`` are kept, language-dependent
    annotation columns are added, the rows are sorted by ``count`` in
    descending order and the result is written to ``err_count.csv`` in
    the same directory. If the directory does not exist a warning is
    printed and the experiment is skipped.

    Progress messages use single-argument ``print(...)`` calls, which
    produce the same output under Python 2 and Python 3.

    Args:
        basedir: root directory under which ``results/sup/...`` lives.
        exp_conf_fname: path of the CSV file listing the experiments.
    """

    ## read the list of experiments to be analyzed
    print('Read list of experiments')
    conf_df = pd.read_csv(exp_conf_fname, header=0, sep=',')

    for _, rec in conf_df.iterrows():

        slang = rec['src']
        tlang = rec['tgt']
        epoch = rec['epoch']

        edir = get_edir(rec)

        exp_dirname = '{basedir}/results/sup/{dataset}/{exp}/{rep}/{edir}'.format(
            basedir=basedir,
            dataset=rec['dataset'],
            rep=rec['representation'],
            exp=rec['exp'],
            edir=edir)

        out_dirname = '{exp_dirname}/outputs/{epoch:03d}_analysis_{slang}-{tlang}'.format(
            exp_dirname=exp_dirname, epoch=epoch, slang=slang, tlang=tlang)

        print('Starting Experiment: ' + exp_dirname)
        if os.path.isdir(out_dirname):
            a_df = align.read_align_count_file(
                '{}/alignment_count.csv'.format(out_dirname))
            # Keep only the mismatches; copy so the new columns are added
            # to an independent frame rather than a view of a_df.
            err_df = a_df[a_df.ref_char != a_df.out_char].copy(deep=True)

            if isc.is_supported_language(tlang):
                _add_ref_out_columns(
                    err_df, 'roman',
                    lambda c: indtrans.ItransTransliterator.to_itrans(
                        c, tlang))
                _add_ref_out_columns(
                    err_df, 'unicode',
                    lambda c: '{:2x}'.format(isc.get_offset(c, tlang)))
            if tlang == 'ar':
                _add_ref_out_columns(err_df, 'roman',
                                     a2r_xlit.transliterate)
                _add_ref_out_columns(err_df, 'unicode',
                                     lambda c: '{:4x}'.format(ord(c)))
            if align.cci.is_supported_language(tlang):
                _add_ref_out_columns(
                    err_df, 'charcat',
                    lambda c: align.cci.get_char_type(c, tlang))

            err_df.sort_values(by='count',
                               axis=0,
                               ascending=False,
                               inplace=True)
            err_df.to_csv('{}/err_count.csv'.format(out_dirname),
                          encoding='utf-8')
        else:
            print('WARNING (run_sort_errors): Could not analyze following experiment: {} {} {} {} {} epoch: {}'.format(
                rec['dataset'], rec['exp'], rec['representation'], slang,
                tlang, epoch))
        print('End Experiment: ' + exp_dirname)
 def is_supported_language(self, lang):
     """Return a truthy value when ``lang`` is usable by this object.

     ``isc.is_supported_language`` is consulted first and its truthy
     result is returned directly; only when it is falsy is membership of
     ``lang`` in ``self.vowel_set`` checked.
     """
     from_isc = isc.is_supported_language(lang)
     if not from_isc:
         return lang in self.vowel_set
     return from_isc