Code Example #1
File: utilities.py  Project: prajdabre/knmt
def ned(srcw, tgtw, slang, tlang, w_del=1.0, w_ins=1.0, w_sub=1.0):
    """Weighted edit distance between two Indic words. Characters are compared
    by script offset, so corresponding characters from different Indic scripts
    count as matches. Returns (distance, len(srcw), len(tgtw))."""
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    # Initialize the first column (pure deletions) and first row (pure insertions)
    score_mat[:, 0] = np.array([si * w_del for si in xrange(score_mat.shape[0])])
    score_mat[0, :] = np.array([ti * w_ins for ti in xrange(score_mat.shape[1])])

    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            so = isc.get_offset(sc, slang)
            to = isc.get_offset(tc, tlang)
            if isc.in_coordinated_range_offset(
                    so) and isc.in_coordinated_range_offset(to) and so == to:
                score_mat[si, ti] = score_mat[si - 1, ti - 1]
            elif not (isc.in_coordinated_range_offset(so)
                      or isc.in_coordinated_range_offset(to)) and sc == tc:
                score_mat[si, ti] = score_mat[si - 1, ti - 1]
            else:
                score_mat[si, ti] = min(
                    score_mat[si - 1, ti - 1] + w_sub,
                    score_mat[si, ti - 1] + w_ins,
                    score_mat[si - 1, ti] + w_del,
                )
    return (score_mat[-1, -1], float(len(srcw)), float(len(tgtw)))
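A minimal usage sketch (not from the original project; it assumes np is numpy and isc is indicnlp.script.indic_scripts, as presumably imported at the top of utilities.py). The raw distance is typically normalized by the longer word length:

src = u'\u0928\u092e\u0928'   # Devanagari na-ma-na
tgt = u'\u0ca8\u0cae\u0ca8'   # the same offsets in Kannada
dist, src_len, tgt_len = ned(src, tgt, 'hi', 'kn')
print(dist / max(src_len, tgt_len))   # 0.0: every character matches by script offset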
Code Example #2
    def to_itrans(text, lang_code):
        if lang_code in langinfo.SCRIPT_RANGES:
            if lang_code == 'ml':
                # Change chillu characters to the corresponding consonant + halant
                text = text.replace('\u0d7a', '\u0d23\u0d4d')
                text = text.replace('\u0d7b', '\u0d28\u0d4d')
                text = text.replace('\u0d7c', '\u0d30\u0d4d')
                text = text.replace('\u0d7d', '\u0d32\u0d4d')
                text = text.replace('\u0d7e', '\u0d33\u0d4d')
                text = text.replace('\u0d7f', '\u0d15\u0d4d')

            offsets = [isc.get_offset(c, lang_code) for c in text]

            ### naive lookup
            # itrans_l = [ OFFSET_TO_ITRANS.get(o, '-' ) for o in offsets ]
            itrans_l = []
            for o in offsets:
                itrans = OFFSET_TO_ITRANS.get(
                    o, chr(langinfo.SCRIPT_RANGES[lang_code][0] + o))
                if langinfo.is_halanta_offset(o):
                    itrans = ''
                    if len(itrans_l) > 0:
                        itrans_l.pop()
                elif langinfo.is_vowel_sign_offset(o) and len(itrans_l) > 0:
                    itrans_l.pop()
                itrans_l.extend(itrans)

            return ''.join(itrans_l)

        else:
            return text
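A hedged usage sketch (assuming to_itrans is exposed as a static method of ItransTransliterator in indicnlp.transliterate.unicode_transliterate):

from indicnlp.transliterate.unicode_transliterate import ItransTransliterator

# Romanize a Devanagari word into ITRANS notation
print(ItransTransliterator.to_itrans(u'\u0928\u092e\u0938\u094d\u0924\u0947', 'hi'))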
Code Example #3
File: encoder_analysis.py  Project: satishkrr/mlxlit
def get_label(x, lang):
    """Label for character x: its ITRANS romanization plus hex script offset for
    supported Indic scripts, the hex code point otherwise, or x itself for
    unsupported languages."""
    if isc.is_supported_language(lang):
        if isc.in_coordinated_range(x, lang):
            return indtrans.ItransTransliterator.to_itrans(
                x, lang) + '({:2x})'.format(isc.get_offset(x, lang))
        else:
            return str(hex(ord(x)))
    else:
        return x
Code Example #4
File: utilities.py  Project: satishkrr/mlxlit
def lcsr_indic(srcw, tgtw, slang, tlang):
    """Longest common subsequence ratio (LCSR) between two Indic words, matching
    characters by script offset. Returns (lcsr, len(srcw), len(tgtw))."""
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            so = isc.get_offset(sc, slang)
            to = isc.get_offset(tc, tlang)

            if isc.in_coordinated_range_offset(
                    so) and isc.in_coordinated_range_offset(to) and so == to:
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            elif not (isc.in_coordinated_range_offset(so)
                      or isc.in_coordinated_range_offset(to)) and sc == tc:
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1],
                                        score_mat[si - 1, ti])

    return (score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
            float(len(srcw)), float(len(tgtw)))
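Usage sketch (same assumptions as for Code Example #1). Unlike ned, lcsr_indic already returns a ratio in [0, 1], so no further normalization is needed:

lcsr, src_len, tgt_len = lcsr_indic(u'\u0928\u092e\u0928', u'\u0ca8\u0cae\u0ca8', 'hi', 'kn')
print(lcsr)   # 1.0: every character matches by script offset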
Code Example #5
def get_column_name(x, tlang):
    """
     Get column name (char) in ascii (romanized) 
    """
    if isc.is_supported_language(tlang):
        #return x if tlang=='hi' else indtrans.UnicodeIndicTransliterator.transliterate(x,tlang,'hi')
        if isc.in_coordinated_range(x, tlang):
            return indtrans.ItransTransliterator.to_itrans(
                x, tlang) + '({:2x})'.format(isc.get_offset(x, tlang))
        else:
            return str(hex(ord(x)))
    elif tlang == 'ar':
        return a2r_xlit.transliterate(x)
    else:
        return x
Code Example #6
File: Mapping.py  Project: satishkrr/mlxlit
    def get_index(self, c, lang=None):
        """Return the vocabulary index of character c. Supported Indic characters
        are pivoted to their Devanagari (hi) counterpart so that all Indic scripts
        share one vocabulary; outside update mode, unknown characters map to UNK."""
        if len(c) == 1 and lang is not None and isc.in_coordinated_range(
                c, lang):
            pid = isc.get_offset(c, lang)
            c_hi = isc.offset_to_char(pid, 'hi')
            if (not self.update_mode) and (c_hi not in self.vocab_c2i):
                c_hi = Mapping.UNK
            index = self.vocab_c2i[c_hi]
            if self.update_mode:
                self.indic_i2pid[index] = pid
        else:
            if (not self.update_mode) and (c not in self.vocab_c2i):
                c = Mapping.UNK
            index = self.vocab_c2i[c]

        if self.update_mode:
            self.lang_list.add(lang)

        return index
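The pan-Indic vocabulary trick in get_index pivots every supported Indic character to its Devanagari (hi) counterpart before the lookup. A minimal sketch of just that pivot step (assuming isc is indicnlp.script.indic_scripts; get_offset and offset_to_char are the same helpers used above):

from indicnlp.script import indic_scripts as isc

c = u'\u0ca8'                          # Kannada letter NA
pid = isc.get_offset(c, 'kn')          # script-relative offset
c_hi = isc.offset_to_char(pid, 'hi')   # same offset in Devanagari: u'\u0928'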
Code Example #7
File: encoder_analysis.py  Project: satishkrr/mlxlit
def main(argv=None):
    """
     Main function for the program 
    """
    def prepare_data():

        print 'Reading test data'

        test_data = MonoDataReader.MonoDataReader(FLAGS.lang, FLAGS.in_fname,
                                                  mapping[FLAGS.lang],
                                                  FLAGS.max_seq_length)
        sequences, sequence_masks, sequence_lengths = test_data.get_data()

        #return (sequences, None, sequence_lengths, sequence_masks)

        new_seq_data = []
        new_seq_lengths = []
        new_seq_pos = []

        for i in range(0, sequences.shape[0]):
            for j in range(0, sequence_lengths[i]):
                if sequences[i, j] in char_ids_to_analyze:
                    start = max(1, j - FLAGS.window_size)  ## GO not considered
                    end = min(sequence_lengths[i] - 1,
                              j + FLAGS.window_size + 1)  ## EOW not considered
                    l = end - start + 2  ## 2 to account for EOW and GO
                    seq_slice = np.concatenate([
                        [mapping[FLAGS.lang].get_index(Mapping.Mapping.GO)],
                        sequences[i, start:end],
                        [mapping[FLAGS.lang].get_index(Mapping.Mapping.EOW)],
                        [mapping[FLAGS.lang].get_index(Mapping.Mapping.PAD)] *
                        (FLAGS.max_seq_length - l),
                    ])
                    new_seq_data.append(seq_slice)
                    new_seq_lengths.append(l)
                    new_seq_pos.append(j - start + 1)

        ### add points for the vocabulary without context
        ## single character
        #for cid in char_ids_to_analyze:
        #    seq_slice=np.concatenate(   [
        #                                    [mapping[FLAGS.lang].get_index(Mapping.Mapping.GO)],
        #                                    [cid],
        #                                    [mapping[FLAGS.lang].get_index(Mapping.Mapping.EOW)],
        #                                    [mapping[FLAGS.lang].get_index(Mapping.Mapping.PAD)]*(FLAGS.max_seq_length-3),
        #                                ]  )
        #    new_seq_data.append(seq_slice)
        #    new_seq_lengths.append(3)
        #    new_seq_pos.append(1)

        #for cid in char_ids_to_analyze:
        ## character thrice
        #    seq_slice=np.concatenate(   [
        #                                    [mapping[FLAGS.lang].get_index(Mapping.Mapping.GO)],
        #                                    [cid]*3,
        #                                    [mapping[FLAGS.lang].get_index(Mapping.Mapping.EOW)],
        #                                    [mapping[FLAGS.lang].get_index(Mapping.Mapping.PAD)]*(FLAGS.max_seq_length-5),
        #                                ]  )
        #    new_seq_data.append(seq_slice)
        #    new_seq_lengths.append(5)
        #    new_seq_pos.append(2)

        # Create masks, one row per sequence: 1 for real characters, 0 for PAD positions.
        new_seq_masks = np.zeros([len(new_seq_data), FLAGS.max_seq_length],
                                 dtype=np.float32)
        for i in range(len(new_seq_data)):
            new_seq_masks[i][:new_seq_lengths[i]] = 1

        return (np.asarray(new_seq_data, dtype=np.int32),
                np.asarray(new_seq_pos, dtype=np.int32),
                np.asarray(new_seq_lengths, dtype=np.int32), new_seq_masks)

    def create_graph():

        print "Start graph creation"
        # Creating Model object
        model = AttentionModel.AttentionModel(
            mapping, representation, FLAGS.max_seq_length,
            FLAGS.embedding_size, FLAGS.enc_rnn_size, FLAGS.dec_rnn_size,
            FLAGS.enc_type, FLAGS.separate_output_embedding)

        ## Creating placeholder for sequences, masks and lengths and dropout keep probability
        batch_sequences = tf.placeholder(shape=[None, FLAGS.max_seq_length],
                                         dtype=tf.int32)
        batch_sequence_lengths = tf.placeholder(shape=[None], dtype=tf.float32)

        # Predict output for test sequences
        o_enc_outputs = compute_hidden_representation(model, batch_sequences,
                                                      batch_sequence_lengths,
                                                      FLAGS.lang)

        print "Done with creating graph. Starting session"
        return batch_sequences, batch_sequence_lengths, o_enc_outputs

    def run_graph():

        print "Starting session"

        saver = tf.train.Saver(max_to_keep=3)

        #Start Session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        sess.run(tf.initialize_all_variables())
        saver.restore(sess, FLAGS.model_fname)

        print "Session started"

        test_time = 0.0
        b_enc_outputs_list = []

        print 'Starting execution'
        for start in xrange(0, sequences.shape[0], FLAGS.batch_size):
            end = min(start + FLAGS.batch_size, sequences.shape[0])

            batch_start_time = time.time()

            data_sequences = sequences[start:end, :]
            data_sequence_lengths = sequence_lengths[start:end]

            b_enc_outputs = sess.run(o_enc_outputs,
                                     feed_dict={
                                         batch_sequences:
                                         data_sequences,
                                         batch_sequence_lengths:
                                         data_sequence_lengths
                                     })
            b_enc_outputs_list.append(b_enc_outputs)

            batch_end_time = time.time()
            test_time += (batch_end_time - batch_start_time)

            print 'Encoded {} of {} sequences'.format(end, sequences.shape[0])
            sys.stdout.flush()

        enc_outputs = np.concatenate(b_enc_outputs_list, axis=0)
        print 'Ending execution'

        return enc_outputs

    ################## WORK STARTS HERE ############

    ##### Obtaining Encoder Embeddings
    representation = init_representation()

    mapping = init_mapping(representation)

    chars_to_analyze = input_chars_to_analyze()

    char_ids_to_analyze = [
        mapping[FLAGS.lang].get_index(x, FLAGS.lang) for x in chars_to_analyze
    ]

    sequences, sequence_pos, sequence_lengths, sequence_masks = prepare_data()

    batch_sequences, batch_sequence_lengths, o_enc_outputs = create_graph()

    enc_outputs = run_graph()

    #### Prepare data for visualization

    char_ctx_embed_list = []
    char_list = []

    #### for window based approach
    for i in range(0, sequences.shape[0]):
        pos = sequence_pos[i]
        char_ctx_embed_list.append(enc_outputs[i, pos, :])
        char_list.append(sequences[i, pos])

    char_ctx_embed_flat = np.array(char_ctx_embed_list)

    ### call tsne
    low_embedder = TSNE()
    low_embeddings = low_embedder.fit_transform(char_ctx_embed_flat)

    ### plot
    N = low_embeddings.shape[0]
    x = low_embeddings[:, 0]
    y = low_embeddings[:, 1]

    cols_list = np.arange(0.0, 1.0, 1 / float(len(chars_to_analyze)))
    char_col_map = {}
    for i, c in enumerate(char_ids_to_analyze):
        char_col_map[c] = cols_list[i]

    colors_data = [char_col_map[c] for c in char_list]

    print N
    cm = plt.get_cmap('jet')
    vs = len(char_ids_to_analyze)

    fig, ax = plt.subplots()
    scatter = ax.scatter(x, y, c=colors_data, cmap=cm, alpha=0.5)
    #plt.scatter(x[-vs:], y[-vs:], c=colors_data[-vs:], cmap=cm, alpha=1.0, marker='x')

    gen_label = lambda char: (indtrans.ItransTransliterator.to_itrans(
        char, FLAGS.lang) + '({:2x})'.format(isc.get_offset(char, FLAGS.lang)))
    patches = [
        mpatches.Patch(label=get_label(char, FLAGS.lang), color=cm(color))
        for char, color in zip(chars_to_analyze, cols_list)
    ]
    ax.legend(handles=patches, ncol=3, fontsize='xx-small')

    labels = []
    for i in range(0, sequences.shape[0]):
        labels.append(u''.join([
            mapping[FLAGS.lang].get_char(sequences[i, j], FLAGS.lang)
            for j in range(1, sequence_lengths[i] - 1)
        ]))

    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    mpld3.plugins.connect(fig, tooltip)

    if FLAGS.out_html_fname != '':
        mpld3.save_html(fig, FLAGS.out_html_fname)
    ##mpld3.show(ip='10.129.2.170',port=10002, open_browser=False)

    if FLAGS.out_img_fname != '':
        plt.savefig(FLAGS.out_img_fname)
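The plotting stage at the end of main can be exercised on its own. A self-contained sketch with synthetic data standing in for the encoder outputs (sklearn and matplotlib assumed, as in the original script):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Synthetic "contextual embeddings": 200 points for 5 hypothetical characters
rng = np.random.RandomState(0)
char_ids = np.repeat(np.arange(5), 40)
embeddings = rng.randn(200, 64) + char_ids[:, None]

# Project to 2D and color each point by its character id
low = TSNE(n_components=2).fit_transform(embeddings)
fig, ax = plt.subplots()
ax.scatter(low[:, 0], low[:, 1], c=char_ids, cmap=plt.get_cmap('jet'), alpha=0.5)
plt.savefig('tsne_chars.png')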
Code Example #8
def run_sort_errors(basedir, exp_conf_fname):
    """For each experiment listed in exp_conf_fname, collect character-level
    alignment errors, annotate them with romanized and hex forms, and write
    them to err_count.csv sorted by frequency."""

    ## read the list of experiments to be analyzed
    print 'Read list of experiments'
    conf_df = pd.read_csv(exp_conf_fname, header=0, sep=',')

    augmented_data = []

    for rec in [x[1] for x in conf_df.iterrows()]:

        slang = rec['src']
        tlang = rec['tgt']
        epoch = rec['epoch']

        edir = get_edir(rec)

        exp_dirname = '{basedir}/results/sup/{dataset}/{exp}/{rep}/{edir}'.format(
            basedir=basedir,
            dataset=rec['dataset'],
            rep=rec['representation'],
            exp=rec['exp'],
            edir=edir)

        out_dirname = '{exp_dirname}/outputs/{epoch:03d}_analysis_{slang}-{tlang}'.format(
            exp_dirname=exp_dirname, epoch=epoch, slang=slang, tlang=tlang)

        print 'Starting Experiment: ' + exp_dirname
        if os.path.isdir(out_dirname):
            a_df = align.read_align_count_file(
                '{}/alignment_count.csv'.format(out_dirname))
            err_df = a_df[a_df.ref_char != a_df.out_char].copy(deep=True)
            if isc.is_supported_language(tlang):
                err_df['roman_ref'] = err_df.apply(
                    lambda x: (indtrans.ItransTransliterator.to_itrans(
                        x['ref_char'], tlang)),
                    axis=1)
                err_df['roman_out'] = err_df.apply(
                    lambda x: (indtrans.ItransTransliterator.to_itrans(
                        x['out_char'], tlang)),
                    axis=1)
                err_df['unicode_ref'] = err_df.apply(
                    lambda x:
                    ('{:2x}'.format(isc.get_offset(x['ref_char'], tlang))),
                    axis=1)
                err_df['unicode_out'] = err_df.apply(
                    lambda x:
                    ('{:2x}'.format(isc.get_offset(x['out_char'], tlang))),
                    axis=1)
            if tlang == 'ar':
                err_df['roman_ref'] = err_df.apply(
                    lambda x: (a2r_xlit.transliterate(x['ref_char'])), axis=1)
                err_df['roman_out'] = err_df.apply(
                    lambda x: (a2r_xlit.transliterate(x['out_char'])), axis=1)
                err_df['unicode_ref'] = err_df.apply(
                    lambda x: ('{:4x}'.format(ord(x['ref_char']))), axis=1)
                err_df['unicode_out'] = err_df.apply(
                    lambda x: ('{:4x}'.format(ord(x['out_char']))), axis=1)
            if align.cci.is_supported_language(tlang):
                err_df['charcat_ref'] = err_df.apply(
                    lambda x: align.cci.get_char_type(x['ref_char'], tlang),
                    axis=1)
                err_df['charcat_out'] = err_df.apply(
                    lambda x: align.cci.get_char_type(x['out_char'], tlang),
                    axis=1)

            err_df.sort_values(by='count',
                               axis=0,
                               ascending=False,
                               inplace=True)
            err_df.to_csv('{}/err_count.csv'.format(out_dirname),
                          encoding='utf-8')
        else:
            print 'WARNING (run_sort_errors): Could not analyze the following experiment: {} {} {} {} {} epoch: {}'.format(
                rec['dataset'], rec['exp'], rec['representation'], slang,
                tlang, epoch)
        print 'End Experiment: ' + exp_dirname