def ned(srcw, tgtw, slang, tlang, w_del=1.0, w_ins=1.0, w_sub=1.0):
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    score_mat[:, 0] = np.array(
        [si * w_del for si in xrange(score_mat.shape[0])])
    score_mat[0, :] = np.array(
        [ti * w_ins for ti in xrange(score_mat.shape[1])])

    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            so = isc.get_offset(sc, slang)
            to = isc.get_offset(tc, tlang)

            if isc.in_coordinated_range_offset(so) and \
                    isc.in_coordinated_range_offset(to) and so == to:
                score_mat[si, ti] = score_mat[si - 1, ti - 1]
            elif not (isc.in_coordinated_range_offset(so) or
                      isc.in_coordinated_range_offset(to)) and sc == tc:
                score_mat[si, ti] = score_mat[si - 1, ti - 1]
            else:
                score_mat[si, ti] = min(
                    score_mat[si - 1, ti - 1] + w_sub,
                    score_mat[si, ti - 1] + w_ins,
                    score_mat[si - 1, ti] + w_del,
                )

    return (score_mat[-1, -1], float(len(srcw)), float(len(tgtw)))

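# Illustrative helper (not part of the original code): one way to turn the raw
# weighted edit distance returned by ned() into a length-normalized similarity
# in [0, 1] for the default unit weights. The name `ned_normalized` is ours;
# it only assumes ned() above and non-empty input words.
def ned_normalized(srcw, tgtw, slang, tlang):
    dist, src_len, tgt_len = ned(srcw, tgtw, slang, tlang)
    # dist is at most max(src_len, tgt_len) when all weights are 1.0
    return 1.0 - dist / max(src_len, tgt_len)
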
def to_itrans(text, lang_code):
    if lang_code in langinfo.SCRIPT_RANGES:
        if lang_code == 'ml':
            # Change from chillu characters to corresponding consonant+halant
            text = text.replace('\u0d7a', '\u0d23\u0d4d')
            text = text.replace('\u0d7b', '\u0d28\u0d4d')
            text = text.replace('\u0d7c', '\u0d30\u0d4d')
            text = text.replace('\u0d7d', '\u0d32\u0d4d')
            text = text.replace('\u0d7e', '\u0d33\u0d4d')
            text = text.replace('\u0d7f', '\u0d15\u0d4d')

        offsets = [isc.get_offset(c, lang_code) for c in text]

        ### naive lookup
        # itrans_l = [ OFFSET_TO_ITRANS.get(o, '-' ) for o in offsets ]

        itrans_l = []
        for o in offsets:
            itrans = OFFSET_TO_ITRANS.get(
                o, chr(langinfo.SCRIPT_RANGES[lang_code][0] + o))
            if langinfo.is_halanta_offset(o):
                itrans = ''
                if len(itrans_l) > 0:
                    itrans_l.pop()
            elif langinfo.is_vowel_sign_offset(o) and len(itrans_l) > 0:
                itrans_l.pop()
            itrans_l.extend(itrans)

        return ''.join(itrans_l)
    else:
        return text

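# Illustrative usage (assumes the OFFSET_TO_ITRANS table and the Indic NLP
# `langinfo`/`isc` modules used above are loaded); the exact romanization
# depends on the contents of OFFSET_TO_ITRANS:
#
#   itrans_str = to_itrans(u'\u092d\u093e\u0930\u0924', 'hi')  # Devanagari "bhaarat"
#   print itrans_str
#
# Text in an unsupported script (no entry in langinfo.SCRIPT_RANGES) is
# returned unchanged.
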
def get_label(x, lang):
    if isc.is_supported_language(lang):
        if isc.in_coordinated_range(x, lang):
            return indtrans.ItransTransliterator.to_itrans(
                x, lang) + '({:2x})'.format(isc.get_offset(x, lang))
        else:
            return str(hex(ord(x)))
    else:
        return x

def lcsr_indic(srcw, tgtw, slang, tlang):
    score_mat = np.zeros((len(srcw) + 1, len(tgtw) + 1))

    for si, sc in enumerate(srcw, 1):
        for ti, tc in enumerate(tgtw, 1):
            so = isc.get_offset(sc, slang)
            to = isc.get_offset(tc, tlang)

            if isc.in_coordinated_range_offset(so) and \
                    isc.in_coordinated_range_offset(to) and so == to:
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            elif not (isc.in_coordinated_range_offset(so) or
                      isc.in_coordinated_range_offset(to)) and sc == tc:
                score_mat[si, ti] = score_mat[si - 1, ti - 1] + 1.0
            else:
                score_mat[si, ti] = max(score_mat[si, ti - 1],
                                        score_mat[si - 1, ti])

    return (score_mat[-1, -1] / float(max(len(srcw), len(tgtw))),
            float(len(srcw)), float(len(tgtw)))

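# Illustrative helper (not part of the original code): flag a source/target
# word pair as a likely transliteration match when its script-aware LCSR
# crosses a threshold. The function name and the default threshold are ours,
# chosen only for the example; it assumes lcsr_indic() above.
def is_lcsr_match(srcw, tgtw, slang, tlang, threshold=0.8):
    lcsr, _, _ = lcsr_indic(srcw, tgtw, slang, tlang)
    return lcsr >= threshold
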
def get_column_name(x, tlang):
    """
    Get column name (char) in ascii (romanized)
    """
    if isc.is_supported_language(tlang):
        #return x if tlang=='hi' else indtrans.UnicodeIndicTransliterator.transliterate(x,tlang,'hi')
        if isc.in_coordinated_range(x, tlang):
            return indtrans.ItransTransliterator.to_itrans(
                x, tlang) + '({:2x})'.format(isc.get_offset(x, tlang))
        else:
            return str(hex(ord(x)))
    elif tlang == 'ar':
        return a2r_xlit.transliterate(x)
    else:
        return x

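# Illustrative usage (assumes the `isc`/`indtrans` modules and the `a2r_xlit`
# Arabic romanizer referenced above): for an Indic character in the
# coordinated range the column name combines the ITRANS romanization with the
# hex script offset (e.g. something like 'ka(15)' for Devanagari KA, depending
# on the ITRANS table); other characters fall back to a hex code point or the
# Arabic romanizer.
#
#   print get_column_name(u'\u0915', 'hi')   # Devanagari KA
#   print get_column_name(u'\u0628', 'ar')   # Arabic BEH, romanized via a2r_xlit
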
def get_index(self, c, lang=None):
    if len(c) == 1 and lang is not None and isc.in_coordinated_range(c, lang):
        pid = isc.get_offset(c, lang)
        c_hi = isc.offset_to_char(pid, 'hi')
        if (not self.update_mode) and (c_hi not in self.vocab_c2i):
            c_hi = Mapping.UNK
        index = self.vocab_c2i[c_hi]
        if self.update_mode:
            self.indic_i2pid[index] = pid
    else:
        if (not self.update_mode) and (c not in self.vocab_c2i):
            c = Mapping.UNK
        index = self.vocab_c2i[c]

    if self.update_mode:
        self.lang_list.add(lang)

    return index

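# Illustrative usage (hypothetical `mapping` instance of the containing
# Mapping class): for characters in the coordinated Indic range, lookup goes
# through the equivalent Devanagari ('hi') character, so the same phonetic
# offset in different scripts maps to the same vocabulary index.
#
#   idx_hi = mapping.get_index(u'\u0915', 'hi')  # Devanagari KA, offset 0x15
#   idx_bn = mapping.get_index(u'\u0995', 'bn')  # Bengali KA, also offset 0x15
#   assert idx_hi == idx_bn
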
def main(argv=None):
    """
    Main function for the program
    """

    def prepare_data():
        print 'Reading test data'
        test_data = MonoDataReader.MonoDataReader(FLAGS.lang, FLAGS.in_fname,
                                                  mapping[FLAGS.lang],
                                                  FLAGS.max_seq_length)
        sequences, sequence_masks, sequence_lengths = test_data.get_data()
        #return (sequences, None, sequence_lengths, sequence_masks)

        new_seq_data = []
        new_seq_lengths = []
        new_seq_pos = []

        for i in range(0, sequences.shape[0]):
            for j in range(0, sequence_lengths[i]):
                if sequences[i, j] in char_ids_to_analyze:
                    start = max(1, j - FLAGS.window_size)  ## GO not considered
                    end = min(sequence_lengths[i] - 1,
                              j + FLAGS.window_size + 1)  ## EOW not considered
                    l = end - start + 2  ## 2 to account for EOW and GO
                    seq_slice = np.concatenate([
                        [mapping[FLAGS.lang].get_index(Mapping.Mapping.GO)],
                        sequences[i, start:end],
                        [mapping[FLAGS.lang].get_index(Mapping.Mapping.EOW)],
                        [mapping[FLAGS.lang].get_index(Mapping.Mapping.PAD)] *
                        (FLAGS.max_seq_length - l),
                    ])
                    new_seq_data.append(seq_slice)
                    new_seq_lengths.append(l)
                    new_seq_pos.append(j - start + 1)

        ### add points for the vocabulary without context

        ## single character
        #for cid in char_ids_to_analyze:
        #    seq_slice=np.concatenate( [
        #            [mapping[FLAGS.lang].get_index(Mapping.Mapping.GO)],
        #            [cid],
        #            [mapping[FLAGS.lang].get_index(Mapping.Mapping.EOW)],
        #            [mapping[FLAGS.lang].get_index(Mapping.Mapping.PAD)]*(FLAGS.max_seq_length-3),
        #        ] )
        #    new_seq_data.append(seq_slice)
        #    new_seq_lengths.append(3)
        #    new_seq_pos.append(1)

        ## character thrice
        #for cid in char_ids_to_analyze:
        #    seq_slice=np.concatenate( [
        #            [mapping[FLAGS.lang].get_index(Mapping.Mapping.GO)],
        #            [cid]*3,
        #            [mapping[FLAGS.lang].get_index(Mapping.Mapping.EOW)],
        #            [mapping[FLAGS.lang].get_index(Mapping.Mapping.PAD)]*(FLAGS.max_seq_length-5),
        #        ] )
        #    new_seq_data.append(seq_slice)
        #    new_seq_lengths.append(5)
        #    new_seq_pos.append(2)

        # Creating masks. Mask has size = size of list of sequences.
        # Corresponding to each PAD character there is a zero, for all others there is a 1.
        new_seq_masks = np.zeros([len(new_seq_data), FLAGS.max_seq_length],
                                 dtype=np.float32)
        for i in range(len(new_seq_data)):
            new_seq_masks[i][:new_seq_lengths[i]] = 1

        return (np.asarray(new_seq_data, dtype=np.int32),
                np.asarray(new_seq_pos, dtype=np.int32),
                np.asarray(new_seq_lengths, dtype=np.int32), new_seq_masks)

    def create_graph():
        print "Start graph creation"

        # Creating Model object
        model = AttentionModel.AttentionModel(mapping, representation,
                                              FLAGS.max_seq_length,
                                              FLAGS.embedding_size,
                                              FLAGS.enc_rnn_size,
                                              FLAGS.dec_rnn_size,
                                              FLAGS.enc_type,
                                              FLAGS.separate_output_embedding)

        ## Creating placeholders for sequences, masks and lengths and dropout keep probability
        batch_sequences = tf.placeholder(shape=[None, FLAGS.max_seq_length],
                                         dtype=tf.int32)
        batch_sequence_lengths = tf.placeholder(shape=[None],
                                                dtype=tf.float32)

        # Predict output for test sequences
        o_enc_outputs = compute_hidden_representation(model, batch_sequences,
                                                      batch_sequence_lengths,
                                                      FLAGS.lang)

        return batch_sequences, batch_sequence_lengths, o_enc_outputs

    print "Done with creating graph. Starting session"

Starting session" def run_graph(): print "Starting session" saver = tf.train.Saver(max_to_keep=3) #Start Session config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.initialize_all_variables()) saver.restore(sess, FLAGS.model_fname) print "Session started" test_time = 0.0 b_enc_outputs_list = [] print 'Starting execution' for start in xrange(0, sequences.shape[0], FLAGS.batch_size): end = min(start + FLAGS.batch_size, sequences.shape[0]) batch_start_time = time.time() data_sequences = sequences[start:end, :] data_sequence_lengths = sequence_lengths[start:end] b_enc_outputs = sess.run(o_enc_outputs, feed_dict={ batch_sequences: data_sequences, batch_sequence_lengths: data_sequence_lengths }) b_enc_outputs_list.append(b_enc_outputs) batch_end_time = time.time() test_time += (batch_end_time - batch_start_time) print 'Encoded {} of {} sequences'.format(end, sequences.shape[0]) sys.stdout.flush() enc_outputs = np.concatenate(b_enc_outputs_list, axis=0) print 'Ending execution' return enc_outputs ################## WORK STARTS HERE ############ ##### Obtaining Encoder Embeddings representation = init_representation() mapping = init_mapping(representation) chars_to_analyze = input_chars_to_analyze() char_ids_to_analyze = [ mapping[FLAGS.lang].get_index(x, FLAGS.lang) for x in chars_to_analyze ] sequences, sequence_pos, sequence_lengths, sequence_masks, = prepare_data() batch_sequences, batch_sequence_lengths, o_enc_outputs = create_graph() enc_outputs = run_graph() #### Prepare data for visualization char_ctx_embed_list = [] char_list = [] #### for window based approach for i in range(0, sequences.shape[0]): pos = sequence_pos[i] char_ctx_embed_list.append(enc_outputs[i, pos, :]) char_list.append(sequences[i, pos]) char_ctx_embed_flat = np.array(char_ctx_embed_list) ### call tsne low_embedder = TSNE() low_embeddings = low_embedder.fit_transform(char_ctx_embed_flat) ### plot N = low_embeddings.shape[0] x = low_embeddings[:, 0] y = low_embeddings[:, 1] cols_list = np.arange(0.0, 1.0, 1 / float(len(chars_to_analyze))) char_col_map = {} for i, c in enumerate(char_ids_to_analyze): char_col_map[c] = cols_list[i] colors_data = [char_col_map[c] for c in char_list] print N cm = plt.get_cmap('jet') vs = len(char_ids_to_analyze) fig, ax = plt.subplots() scatter = ax.scatter(x, y, c=colors_data, cmap=cm, alpha=0.5) #plt.scatter(x[-vs:], y[-vs:], c=colors_data[-vs:], cmap=cm, alpha=1.0, marker='x') gen_label = lambda char: (indtrans.ItransTransliterator.to_itrans( char, FLAGS.lang) + '({:2x})'.format(isc.get_offset(char, FLAGS.lang))) patches = [ mpatches.Patch(label=get_label(char, FLAGS.lang), color=cm(color)) for char, color in zip(chars_to_analyze, cols_list) ] ax.legend(handles=patches, ncol=3, fontsize='xx-small') labels = [] for i in range(0, sequences.shape[0]): labels.append(u''.join([ mapping[FLAGS.lang].get_char(sequences[i, j], FLAGS.lang) for j in range(1, sequence_lengths[i] - 1) ])) tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels) mpld3.plugins.connect(fig, tooltip) if FLAGS.out_html_fname != '': mpld3.save_html(fig, FLAGS.out_html_fname) ##mpld3.show(ip='10.129.2.170',port=10002, open_browser=False) if FLAGS.out_img_fname != '': plt.savefig(FLAGS.out_img_fname)
def run_sort_errors(basedir, exp_conf_fname):

    ## read the list of experiments to be analyzed
    print 'Read list of experiments'
    conf_df = pd.read_csv(exp_conf_fname, header=0, sep=',')

    augmented_data = []
    for rec in [x[1] for x in conf_df.iterrows()]:
        slang = rec['src']
        tlang = rec['tgt']
        epoch = rec['epoch']
        edir = get_edir(rec)

        exp_dirname = '{basedir}/results/sup/{dataset}/{exp}/{rep}/{edir}'.format(
            basedir=basedir,
            dataset=rec['dataset'],
            rep=rec['representation'],
            exp=rec['exp'],
            edir=edir)
        out_dirname = '{exp_dirname}/outputs/{epoch:03d}_analysis_{slang}-{tlang}'.format(
            exp_dirname=exp_dirname, epoch=epoch, slang=slang, tlang=tlang)

        print 'Starting Experiment: ' + exp_dirname

        if os.path.isdir(out_dirname):
            a_df = align.read_align_count_file(
                '{}/alignment_count.csv'.format(out_dirname))
            err_df = a_df[a_df.ref_char != a_df.out_char].copy(deep=True)

            if isc.is_supported_language(tlang):
                err_df['roman_ref'] = err_df.apply(
                    lambda x: (indtrans.ItransTransliterator.to_itrans(
                        x['ref_char'], tlang)),
                    axis=1)
                err_df['roman_out'] = err_df.apply(
                    lambda x: (indtrans.ItransTransliterator.to_itrans(
                        x['out_char'], tlang)),
                    axis=1)
                err_df['unicode_ref'] = err_df.apply(
                    lambda x: ('{:2x}'.format(
                        isc.get_offset(x['ref_char'], tlang))),
                    axis=1)
                err_df['unicode_out'] = err_df.apply(
                    lambda x: ('{:2x}'.format(
                        isc.get_offset(x['out_char'], tlang))),
                    axis=1)

            if tlang == 'ar':
                err_df['roman_ref'] = err_df.apply(
                    lambda x: (a2r_xlit.transliterate(x['ref_char'])), axis=1)
                err_df['roman_out'] = err_df.apply(
                    lambda x: (a2r_xlit.transliterate(x['out_char'])), axis=1)
                err_df['unicode_ref'] = err_df.apply(
                    lambda x: ('{:4x}'.format(ord(x['ref_char']))), axis=1)
                err_df['unicode_out'] = err_df.apply(
                    lambda x: ('{:4x}'.format(ord(x['out_char']))), axis=1)

            if align.cci.is_supported_language(tlang):
                err_df['charcat_ref'] = err_df.apply(
                    lambda x: align.cci.get_char_type(x['ref_char'], tlang),
                    axis=1)
                err_df['charcat_out'] = err_df.apply(
                    lambda x: align.cci.get_char_type(x['out_char'], tlang),
                    axis=1)

            err_df.sort_values(by='count',
                               axis=0,
                               ascending=False,
                               inplace=True)
            err_df.to_csv('{}/err_count.csv'.format(out_dirname),
                          encoding='utf-8')
        else:
            print 'WARNING (run_sort_errors): Could not analyze following experiment: {} {} {} {} {} epoch: {}'.format(
                rec['dataset'], rec['exp'], rec['representation'], slang,
                tlang, epoch)

        print 'End Experiment: ' + exp_dirname

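# Illustrative call (hypothetical paths): for every experiment listed in the
# configuration CSV, read its alignment_count.csv, annotate the mismatched
# reference/output character pairs, and write a count-sorted err_count.csv
# into the same analysis directory.
#
#   run_sort_errors('/path/to/workspace', '/path/to/experiments.csv')
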