def process_files(align_file, src_file, tgt_file, ref_file, dict_file, out_file, src_sgm, tgt_sgm, lang, is_reverse_alignment):
    """Post-process a translation file, replacing aligned ``<unk>`` tokens, then score it.

    Every line of ``tgt_file`` is read in lockstep with the optional source,
    alignment and reference files.  Two files are written:

    * ``out_file`` -- the post-processed translation,
    * ``tgt_file + '.new'`` -- the input translation with only the
      ``##AT##-##AT##`` marker rewritten.

    The first example that qualifies and a final summary of counts are
    written to stderr.

    Args:
        align_file: Path to the alignment file, or ``''`` to skip the
            alignment-based token replacement.
        src_file: Path to the source sentences, or ``''``.  Source tokens are
            looked up here when ``align_file`` is given.
        tgt_file: Path to the translations to post-process.
        ref_file: Path to the reference translations, or ``''`` to skip the
            evaluation step.
        dict_file: Path to a dictionary readable by ``load_dict``.  ``''`` or
            a path that does not exist disables ``<unk>`` replacement.
        out_file: Output path; ``''`` means ``tgt_file + '.post'``.
        src_sgm: Forwarded to ``nist_bleu`` when it, ``tgt_sgm`` and ``lang``
            are all non-empty.
        tgt_sgm: See ``src_sgm``.
        lang: See ``src_sgm``.
        is_reverse_alignment: When equal to ``True`` the pair returned by
            ``text.aggregate_alignments`` is unpacked as ``(t2s, s2t)``
            instead of ``(s2t, t2s)``.
    """
    is_src = src_file != ''
    is_align = align_file != ''
    is_ref = ref_file != ''

    if out_file == '':
        out_file = tgt_file + '.post'
    new_tgt_file = tgt_file + '.new'

    unk = '<unk>'
    at_token = '##AT##-##AT##'
    at_spaced = ' ##AT##-##AT## '
    whitespace = re.compile(r'\s+')

    line_id = 0
    show_example = True  # only the first qualifying example is printed
    unk_count = 0
    dictionary_count = 0
    identity_count = 0

    # Every handle is registered here so that all of them (including the
    # reference file) are closed even when processing fails half-way.
    opened = []

    def open_utf8(path, mode):
        handle = codecs.open(path, mode, 'utf-8')
        opened.append(handle)
        return handle

    try:
        tgt_inf = open_utf8(tgt_file, 'r')
        if is_src:
            src_inf = open_utf8(src_file, 'r')
        if is_align:
            align_inf = open_utf8(align_file, 'r')
        if is_ref:
            ref_inf = open_utf8(ref_file, 'r')

        # load dict
        is_dict = dict_file != '' and os.path.exists(dict_file)
        if is_dict:
            dict_map = load_dict(dict_file)

        ouf = open_utf8(out_file, 'w')
        new_tgt_ouf = open_utf8(new_tgt_file, 'w')

        for tgt_line in tgt_inf:
            tgt_line = tgt_line.strip()
            debug_count = 0
            debug_str = ''
            if is_src:
                src_line = src_inf.readline().strip()
            if is_ref:
                ref_line = ref_inf.readline().strip()

            if is_align:
                src_tokens = whitespace.split(src_line)
                tgt_tokens = whitespace.split(tgt_line)

                # get alignment
                align_line = align_inf.readline().strip()
                if is_reverse_alignment == True:  # noqa: E712 -- keeps the original comparison
                    # reversed alignment tgtId-srcId
                    (t2s, s2t) = text.aggregate_alignments(align_line)
                else:
                    # normal alignment srcId-tgtId
                    (s2t, t2s) = text.aggregate_alignments(align_line)

                new_tgt_tokens = []
                for tgt_pos, tgt_token in enumerate(tgt_tokens):
                    if tgt_token == unk and is_dict:
                        unk_count += 1
                        if tgt_pos in t2s:  # aligned unk
                            debug_count += 1
                            # only the first aligned source position is used
                            src_token = src_tokens[t2s[tgt_pos][0]]
                            if src_token in dict_map:
                                # there's a word-word translation
                                tgt_token = dict_map[src_token]
                                dictionary_count += 1
                                label = "dict: "
                            else:
                                # identity copy
                                tgt_token = src_token
                                identity_count += 1
                                label = "iden: "
                            if show_example:
                                debug_str = debug_str + label + src_token + " -> " + tgt_token + '\n'
                    new_tgt_tokens.append(tgt_token)
                out_line = ' '.join(new_tgt_tokens)
            else:
                out_line = tgt_line

            # post process: glue the two neighbours of the marker with a hyphen
            if at_token in out_line:
                out_line = out_line.replace(at_spaced, '-')
                tgt_line = tgt_line.replace(at_spaced, '-')

            if not is_align:
                # without alignments every line qualifies as the debug example
                debug_count = 1

            ouf.write('%s\n' % out_line)
            new_tgt_ouf.write('%s\n' % tgt_line)

            # debug info
            if show_example and debug_count > 0:
                sys.stderr.write('# example %d\n' % line_id)
                if is_src:
                    sys.stderr.write('src: %s\n' % (src_line))
                sys.stderr.write('tgt: %s\n' % (tgt_line))
                sys.stderr.write('%s' % (debug_str))
                sys.stderr.write('out: %s\n' % (out_line))
                if is_ref:
                    sys.stderr.write('ref: %s\n' % ref_line)
                show_example = False

            line_id += 1
    finally:
        for handle in opened:
            handle.close()

    sys.stderr.write('# num sents = %d, unk count=%d, dictionary_count=%d, identity_count=%d\n' % (line_id, unk_count, dictionary_count, identity_count))

    # evaluating
    if is_ref:
        script_dir = os.path.dirname(sys.argv[0])
        bleu(script_dir, new_tgt_file, ref_file)
        if is_align:
            chr_f(script_dir, out_file, ref_file)
            bleu(script_dir, out_file, ref_file)

        # NOTE(review): kept inside the reference branch because script_dir is
        # only defined here -- confirm this matches the intended nesting.
        if src_sgm != '' and tgt_sgm != '' and lang != '':
            # compute NIST BLEU score
            nist_bleu(script_dir, out_file, src_sgm, tgt_sgm, lang)
def post_process(align_file, src_file, tgt_file, ref_file, dict_file, is_reverse_alignment):
    """Write the ``.pre`` and (with alignments) ``.post`` versions of a translation file.

    Every line of ``tgt_file`` is read in lockstep with the optional source,
    alignment and reference files.

    * ``tgt_file + '.pre'`` always receives the translation with the
      `` ##AT##-##AT## `` marker replaced by ``-``.
    * ``tgt_file + '.post'`` is only written when ``align_file`` is given; it
      additionally has aligned ``<unk>`` tokens replaced by a dictionary
      translation or, failing that, by the aligned source token.

    The first example that qualifies and a final summary of counts are
    written to stderr.

    Args:
        align_file: Path to the alignment file, or ``''`` to skip the
            ``.post`` output entirely.
        src_file: Path to the source sentences, or ``''``.  Source tokens are
            looked up here when ``align_file`` is given.
        tgt_file: Path to the translations to post-process.
        ref_file: Path to reference translations (only echoed in the debug
            example), or ``''``.
        dict_file: Path to a dictionary readable by ``load_dict``, or ``''``
            to disable ``<unk>`` replacement.
        is_reverse_alignment: When equal to ``True`` the pair returned by
            ``text.aggregate_alignments`` is unpacked as ``(t2s, s2t)``
            instead of ``(s2t, t2s)``.

    Returns:
        ``(pre_file, post_file)``; ``post_file`` is ``''`` when no alignment
        file was given.
    """
    sys.stderr.write('# ref_file = %s\n' % ref_file)

    is_src = src_file != ''
    is_align = align_file != ''
    is_ref = ref_file != ''
    is_dict = dict_file != ''

    pre_file = tgt_file + '.pre'
    post_file = tgt_file + '.post' if is_align else ''

    unk = '<unk>'
    at_spaced = ' ##AT##-##AT## '
    whitespace = re.compile(r'\s+')

    line_id = 0
    show_example = True  # only the first qualifying example is printed
    unk_count = 0
    dictionary_count = 0
    identity_count = 0

    # Every handle is registered here so that exactly the files that were
    # opened get closed -- the .post file does not exist without alignments,
    # and the reference file used to be left open.
    opened = []

    def open_utf8(path, mode):
        handle = codecs.open(path, mode, 'utf-8')
        opened.append(handle)
        return handle

    try:
        if is_src:
            src_inf = open_utf8(src_file, 'r')
        tgt_inf = open_utf8(tgt_file, 'r')
        if is_align:
            align_inf = open_utf8(align_file, 'r')
            post_ouf = open_utf8(post_file, 'w')
        if is_ref:
            ref_inf = open_utf8(ref_file, 'r')

        # load dict
        if is_dict:
            dict_map = load_dict(dict_file)

        pre_ouf = open_utf8(pre_file, 'w')

        for tgt_line in tgt_inf:
            tgt_line = tgt_line.strip()
            debug_count = 0
            debug_str = ''
            if is_src:
                src_line = src_inf.readline().strip()
            if is_ref:
                ref_line = ref_inf.readline().strip()

            if is_align:
                src_tokens = whitespace.split(src_line)
                tgt_tokens = whitespace.split(tgt_line)

                # get alignment
                align_line = align_inf.readline().strip()
                if is_reverse_alignment == True:  # noqa: E712 -- keeps the original comparison
                    # reversed alignment tgtId-srcId
                    (t2s, s2t) = text.aggregate_alignments(align_line)
                else:
                    # normal alignment srcId-tgtId
                    (s2t, t2s) = text.aggregate_alignments(align_line)

                new_tgt_tokens = []
                for tgt_pos, tgt_token in enumerate(tgt_tokens):
                    if tgt_token == unk and is_dict:
                        unk_count += 1
                        if tgt_pos in t2s:  # aligned unk
                            debug_count += 1
                            # only the first aligned source position is used
                            src_token = src_tokens[t2s[tgt_pos][0]]
                            if src_token in dict_map:
                                # there's a word-word translation
                                tgt_token = dict_map[src_token]
                                dictionary_count += 1
                                label = "dict: "
                            else:
                                # identity copy
                                tgt_token = src_token
                                identity_count += 1
                                label = "iden: "
                            if show_example:
                                debug_str = debug_str + label + src_token + " -> " + tgt_token + '\n'
                    new_tgt_tokens.append(tgt_token)
                post_line = ' '.join(new_tgt_tokens)

            # escape ##AT##-##AT## en-de pair (for historical reason)
            tgt_line = tgt_line.replace(at_spaced, '-')

            if not is_align:
                # without alignments every line qualifies as the debug example
                debug_count = 1

            pre_ouf.write('%s\n' % tgt_line)
            if is_align:
                post_line = post_line.replace(at_spaced, '-')
                post_ouf.write('%s\n' % post_line)

            # debug info
            if show_example and debug_count > 0:
                sys.stderr.write('# example %d\n' % line_id)
                if is_src:
                    sys.stderr.write('src: %s\n' % (src_line))
                sys.stderr.write('tgt: %s\n' % (tgt_line))
                sys.stderr.write('%s' % (debug_str))
                if is_align:
                    sys.stderr.write('out: %s\n' % (post_line))
                if is_ref:
                    sys.stderr.write('ref: %s\n' % ref_line)
                show_example = False

            line_id += 1
    finally:
        for handle in opened:
            handle.close()

    sys.stderr.write('# num sents = %d, unk count=%d, dictionary_count=%d, identity_count=%d\n' % (line_id, unk_count, dictionary_count, identity_count))
    return (pre_file, post_file)
def process_files(in_prefix, src_lang, tgt_lang, out_prefix, freq, opt, src_vocab_size, tgt_vocab_size, unk_symbol='<unk>', min_count=10):
    """Build a scored bilingual dictionary from a word-aligned parallel corpus.

    Reads ``in_prefix.src_lang``, ``in_prefix.tgt_lang`` and
    ``in_prefix.align`` line by line, counts aligned (source, target) pairs
    whose tokens are both in the vocabularies returned by ``text.get_vocab``,
    and writes ``out_prefix.src_lang-tgt_lang.dict``.  Each output line is::

        src_token tgt_token p(tgt|src) p(src|tgt) mean_of_both pmi npmi

    Args:
        in_prefix: Common path prefix of the corpus and alignment files.
        src_lang: Source-side file extension.
        tgt_lang: Target-side file extension.
        out_prefix: Path prefix of the output dictionary.
        freq: Frequency value forwarded to ``text.get_vocab``; also selects
            the ``.vocab.f<freq>`` file name when the vocabulary size is not
            positive.
        opt: ``1`` unpacks the pair returned by ``text.aggregate_alignments``
            as ``(t2s, s2t)``; anything else as ``(s2t, t2s)``.
        src_vocab_size: Source vocabulary size; a positive value selects the
            ``.vocab.<size>`` file name.
        tgt_vocab_size: Same, for the target side.
        unk_symbol: Unknown-word symbol forwarded to ``text.get_vocab``.
        min_count: Pairs aligned fewer than this many times are not written.

    Raises:
        ValueError: If a vocabulary size and ``freq`` are both non-positive,
            so no vocabulary file name can be derived.
    """
    def vocab_path(lang, vocab_size):
        # A positive vocabulary size takes precedence over the frequency.
        if vocab_size > 0:
            return in_prefix + '.' + lang + '.vocab.' + str(vocab_size)
        if freq > 0:
            return in_prefix + '.' + lang + '.vocab.f' + str(freq)
        raise ValueError(
            'cannot derive the %s vocab file: vocab size and freq are both <= 0' % lang)

    # input
    sys.stderr.write('# Input from %s.*\n' % (in_prefix))
    src_file = in_prefix + '.' + src_lang
    tgt_file = in_prefix + '.' + tgt_lang
    # Fail early and clearly, before any file is opened.
    src_vocab_file = vocab_path(src_lang, src_vocab_size)
    tgt_vocab_file = vocab_path(tgt_lang, tgt_vocab_size)

    whitespace = re.compile(r'\s+')
    line_id = 0
    bi_counts = {}  # bi_counts[src_id][tgt_id]
    src_counts = {}
    tgt_counts = {}
    total_count = 0  # total alignment links

    opened = []
    try:
        src_inf = codecs.open(src_file, 'r', 'utf-8')
        opened.append(src_inf)
        tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')
        opened.append(tgt_inf)
        align_inf = codecs.open(in_prefix + '.align', 'r', 'utf-8')
        opened.append(align_inf)

        (src_words, src_vocab_map, src_vocab_size) = text.get_vocab(src_file, src_vocab_file, freq, src_vocab_size, unk_symbol)
        (tgt_words, tgt_vocab_map, tgt_vocab_size) = text.get_vocab(tgt_file, tgt_vocab_file, freq, tgt_vocab_size, unk_symbol)

        # process corpus
        for src_line in src_inf:
            src_tokens = whitespace.split(src_line.strip())
            tgt_tokens = whitespace.split(tgt_inf.readline().strip())
            if opt == 1:
                # reversed alignment tgtId-srcId
                (t2s, s2t) = text.aggregate_alignments(align_inf.readline())
            else:
                # normal alignment srcId-tgtId
                (s2t, t2s) = text.aggregate_alignments(align_inf.readline())

            # process alignments
            for tgt_pos in t2s:
                tgt_token = tgt_tokens[tgt_pos]
                for src_pos in t2s[tgt_pos]:
                    src_token = src_tokens[src_pos]
                    if src_token not in src_vocab_map or tgt_token not in tgt_vocab_map:
                        continue  # only count links where both words are known

                    src_id = src_vocab_map[src_token]
                    tgt_id = tgt_vocab_map[tgt_token]
                    pair_counts = bi_counts.setdefault(src_id, {})
                    pair_counts[tgt_id] = pair_counts.get(tgt_id, 0) + 1
                    src_counts[src_id] = src_counts.get(src_id, 0) + 1
                    tgt_counts[tgt_id] = tgt_counts.get(tgt_id, 0) + 1
                    total_count += 1

            line_id += 1
            if line_id % 100000 == 0:
                sys.stderr.write(' (%d) ' % line_id)
    finally:
        for handle in opened:
            handle.close()
    sys.stderr.write(' num lines=%d, total links=%d\n' % (line_id, total_count))

    # output
    check_dir(out_prefix)
    dict_file = out_prefix + '.' + src_lang + '-' + tgt_lang + '.dict'
    dict_ouf = codecs.open(dict_file, 'w', 'utf-8')
    try:
        sys.stderr.write('# Output to %s*\n' % dict_file)
        for src_id, pair_counts in bi_counts.items():
            src_count = src_counts[src_id]
            # float() keeps true division under Python 2 as well
            p_src = float(src_count) / float(total_count)
            for tgt_id, bi_count in pair_counts.items():
                if bi_count < min_count:
                    continue

                tgt_count = tgt_counts[tgt_id]
                p_tgt = float(tgt_count) / float(total_count)
                p_src_given_tgt = float(bi_count) / float(tgt_count)
                p_tgt_given_src = float(bi_count) / float(src_count)

                # normalized pmi
                p_src_tgt = float(bi_count) / float(total_count)  # joint
                pmi = math.log(p_src_tgt / (p_src * p_tgt))
                log_joint = math.log(p_src_tgt)
                # A pair that owns every link has log(1) == 0; use the limit
                # value 1.0 instead of dividing by zero.
                npmi = -pmi / log_joint if log_joint != 0 else 1.0

                dict_ouf.write('%s %s %g %g %g %g %g\n' % (src_words[src_id], tgt_words[tgt_id], p_tgt_given_src, p_src_given_tgt, (p_src_given_tgt + p_tgt_given_src) / 2, pmi, npmi))
    finally:
        dict_ouf.close()
def process_files(in_prefix, src_lang, tgt_lang, out_prefix, freq, opt, src_vocab_size, tgt_vocab_size, unk_symbol='<unk>', min_count=10):
    """Build a scored bilingual dictionary from a word-aligned parallel corpus.

    Reads ``in_prefix.src_lang``, ``in_prefix.tgt_lang`` and
    ``in_prefix.align`` line by line, counts aligned (source, target) pairs
    whose tokens are both in the vocabularies returned by ``text.get_vocab``,
    and writes ``out_prefix.src_lang-tgt_lang.dict``.  Each output line is::

        src_token tgt_token p(tgt|src) p(src|tgt) mean_of_both pmi npmi

    Args:
        in_prefix: Common path prefix of the corpus and alignment files.
        src_lang: Source-side file extension.
        tgt_lang: Target-side file extension.
        out_prefix: Path prefix of the output dictionary.
        freq: Frequency value forwarded to ``text.get_vocab``; also selects
            the ``.vocab.f<freq>`` file name when the vocabulary size is not
            positive.
        opt: ``1`` unpacks the pair returned by ``text.aggregate_alignments``
            as ``(t2s, s2t)``; anything else as ``(s2t, t2s)``.
        src_vocab_size: Source vocabulary size; a positive value selects the
            ``.vocab.<size>`` file name.
        tgt_vocab_size: Same, for the target side.
        unk_symbol: Unknown-word symbol forwarded to ``text.get_vocab``.
        min_count: Pairs aligned fewer than this many times are not written.

    Raises:
        ValueError: If a vocabulary size and ``freq`` are both non-positive,
            so no vocabulary file name can be derived.
    """
    def vocab_path(lang, vocab_size):
        # A positive vocabulary size takes precedence over the frequency.
        if vocab_size > 0:
            return in_prefix + '.' + lang + '.vocab.' + str(vocab_size)
        if freq > 0:
            return in_prefix + '.' + lang + '.vocab.f' + str(freq)
        raise ValueError(
            'cannot derive the %s vocab file: vocab size and freq are both <= 0' % lang)

    # input
    sys.stderr.write('# Input from %s.*\n' % (in_prefix))
    src_file = in_prefix + '.' + src_lang
    tgt_file = in_prefix + '.' + tgt_lang
    # Fail early and clearly, before any file is opened.
    src_vocab_file = vocab_path(src_lang, src_vocab_size)
    tgt_vocab_file = vocab_path(tgt_lang, tgt_vocab_size)

    whitespace = re.compile(r'\s+')
    line_id = 0
    bi_counts = {}  # bi_counts[src_id][tgt_id]
    src_counts = {}
    tgt_counts = {}
    total_count = 0  # total alignment links

    opened = []
    try:
        src_inf = codecs.open(src_file, 'r', 'utf-8')
        opened.append(src_inf)
        tgt_inf = codecs.open(tgt_file, 'r', 'utf-8')
        opened.append(tgt_inf)
        align_inf = codecs.open(in_prefix + '.align', 'r', 'utf-8')
        opened.append(align_inf)

        (src_words, src_vocab_map, src_vocab_size) = text.get_vocab(
            src_file, src_vocab_file, freq, src_vocab_size, unk_symbol)
        (tgt_words, tgt_vocab_map, tgt_vocab_size) = text.get_vocab(
            tgt_file, tgt_vocab_file, freq, tgt_vocab_size, unk_symbol)

        # process corpus
        for src_line in src_inf:
            src_tokens = whitespace.split(src_line.strip())
            tgt_tokens = whitespace.split(tgt_inf.readline().strip())
            if opt == 1:
                # reversed alignment tgtId-srcId
                (t2s, s2t) = text.aggregate_alignments(align_inf.readline())
            else:
                # normal alignment srcId-tgtId
                (s2t, t2s) = text.aggregate_alignments(align_inf.readline())

            # process alignments
            for tgt_pos in t2s:
                tgt_token = tgt_tokens[tgt_pos]
                for src_pos in t2s[tgt_pos]:
                    src_token = src_tokens[src_pos]
                    if src_token not in src_vocab_map or tgt_token not in tgt_vocab_map:
                        continue  # only count links where both words are known

                    src_id = src_vocab_map[src_token]
                    tgt_id = tgt_vocab_map[tgt_token]
                    pair_counts = bi_counts.setdefault(src_id, {})
                    pair_counts[tgt_id] = pair_counts.get(tgt_id, 0) + 1
                    src_counts[src_id] = src_counts.get(src_id, 0) + 1
                    tgt_counts[tgt_id] = tgt_counts.get(tgt_id, 0) + 1
                    total_count += 1

            line_id += 1
            if line_id % 100000 == 0:
                sys.stderr.write(' (%d) ' % line_id)
    finally:
        for handle in opened:
            handle.close()
    sys.stderr.write(' num lines=%d, total links=%d\n' % (line_id, total_count))

    # output
    check_dir(out_prefix)
    dict_file = out_prefix + '.' + src_lang + '-' + tgt_lang + '.dict'
    dict_ouf = codecs.open(dict_file, 'w', 'utf-8')
    try:
        sys.stderr.write('# Output to %s*\n' % dict_file)
        for src_id, pair_counts in bi_counts.items():
            src_count = src_counts[src_id]
            # float() keeps true division under Python 2 as well
            p_src = float(src_count) / float(total_count)
            for tgt_id, bi_count in pair_counts.items():
                if bi_count < min_count:
                    continue

                tgt_count = tgt_counts[tgt_id]
                p_tgt = float(tgt_count) / float(total_count)
                p_src_given_tgt = float(bi_count) / float(tgt_count)
                p_tgt_given_src = float(bi_count) / float(src_count)

                # normalized pmi
                p_src_tgt = float(bi_count) / float(total_count)  # joint
                pmi = math.log(p_src_tgt / (p_src * p_tgt))
                log_joint = math.log(p_src_tgt)
                # A pair that owns every link has log(1) == 0; use the limit
                # value 1.0 instead of dividing by zero.
                npmi = -pmi / log_joint if log_joint != 0 else 1.0

                dict_ouf.write(
                    '%s %s %g %g %g %g %g\n' %
                    (src_words[src_id], tgt_words[tgt_id], p_tgt_given_src,
                     p_src_given_tgt, (p_src_given_tgt + p_tgt_given_src) / 2,
                     pmi, npmi))
    finally:
        dict_ouf.close()