def ngram_count_patterns(freq_lexicon, candidates):
    # Expects the module-level THRESHOLD constant and the helpers wrap()
    # and common.tokenise_*_line() to be available in the enclosing script.
    MAX_NGRAMS = 3
    cur_line = 0
    sl_tl_defaults = {}
    sl_tl = {}
    ngrams = {}
    meevents = {}    # meevents[slword][counter] = [feat, feat, feat]
    meoutcomes = {}  # meoutcomes[slword][counter] = tlword
    event_counter = 0
    features = {}    # features[(slword, ['a', 'list'], tlword)] = 3
    feature_counter = 0
    indexes = {}
    trad_counter = {}

    # Read the frequency lexicon: for each SL word collect its possible
    # translations, remember the default one (marked with '@'), and give
    # each (sl, tl) pair a stable index.
    for line in open(freq_lexicon, 'r').readlines():
        if len(line) < 1:
            continue
        w = int(line.split(' ')[0])
        if w < THRESHOLD:
            continue
        row = common.tokenise_tagger_line(line)
        sl = wrap(row[0]).lower()
        tl = wrap(row[1]).lower()
        if tl[1] == '*':
            tl = tl[:-3] + '$'
        if sl not in sl_tl:
            sl_tl[sl] = []
        if sl not in trad_counter:
            trad_counter[sl] = 0
        if line.count('@') > 0:
            sl_tl_defaults[sl] = tl
        sl_tl[sl].append(tl)
        indexes[(sl, tl)] = trad_counter[sl]
        trad_counter[sl] = trad_counter[sl] + 1

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []

    for line in open(candidates, 'r').readlines():
        line = line.strip()
        if line[0] == '-':
            # End of a sentence block: walk the SL row, note every ambiguous
            # word, its frequency and its possible translations.
            #
            # sl_tl[sl_word][tl_word] = tl_freq
            i = 0
            for slword in cur_sl_row:
                if len(cur_bt_row[i]['tls']) > 1:
                    for al in cur_al_row:
                        al_sl = int(al.split('-')[1])
                        al_tl = int(al.split('-')[0])
                        if al_sl != i:
                            continue
                        tlword = wrap(cur_tl_row[al_tl].lower())
                        slword = wrap(slword.lower())
                        if tlword[1] == '*' or slword[1] == '*':
                            continue
                        if slword not in sl_tl_defaults:
                            # SL word has no default translation; skip it.
                            continue
                        if (slword, tlword) not in indexes:
                            # (sl, tl) pair not seen in the frequency lexicon.
                            continue

                        # Collect pre-, post- and surrounding n-gram contexts
                        # of the ambiguous word, up to MAX_NGRAMS - 1 words.
                        for j in range(1, MAX_NGRAMS):
                            pregram = ' '.join(map(wrap, cur_sl_row[i - j:i + 1]))
                            postgram = ' '.join(map(wrap, cur_sl_row[i:i + j + 1]))
                            roundgram = ' '.join(map(wrap, cur_sl_row[i - j:i + j + 1]))
                            if slword not in ngrams:
                                ngrams[slword] = {}
                            if pregram not in ngrams[slword]:
                                ngrams[slword][pregram] = {}
                            if postgram not in ngrams[slword]:
                                ngrams[slword][postgram] = {}
                            if roundgram not in ngrams[slword]:
                                ngrams[slword][roundgram] = {}
                            if tlword not in ngrams[slword][pregram]:
                                ngrams[slword][pregram][tlword] = 0
                            if tlword not in ngrams[slword][postgram]:
                                ngrams[slword][postgram][tlword] = 0
                            if tlword not in ngrams[slword][roundgram]:
                                ngrams[slword][roundgram][tlword] = 0
                            ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1
                            ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1
                            ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1

                        if slword not in meevents:
                            meevents[slword] = {}
                        if slword not in meoutcomes:
                            meoutcomes[slword] = {}
                        if event_counter not in meevents[slword]:
                            meevents[slword][event_counter] = []
                        if event_counter not in meoutcomes[slword]:
                            meoutcomes[slword][event_counter] = ''

                        # Turn each observed n-gram context into a numbered
                        # feature and record it for this event.
                        for ni in ngrams[slword]:
                            if ni not in features:
                                feature_counter = feature_counter + 1
                                features[ni] = feature_counter
                            meevents[slword][event_counter].append(features[ni])
                        meoutcomes[slword][event_counter] = tlword
                        del ngrams
                        ngrams = {}

                        if len(sl_tl[slword]) < 2:
                            continue

                        # Emit one training event per line in the format
                        # expected by the maxent trainer.
                        for event in meevents[slword]:
                            outline = str(indexes[(slword, meoutcomes[slword][event])]) + ' # '
                            for j in range(0, len(sl_tl[slword])):
                                for feature in meevents[slword][event]:
                                    outline = outline + str(feature) + ':' + str(j) + ' '
                                outline = outline + ' # '
                            print(slword, '\t', len(sl_tl[slword]), '\t', outline)
                        del meevents
                        del meoutcomes
                        meevents = {}
                        meoutcomes = {}
                i = i + 1
            cur_line = 0
            event_counter = event_counter + 1
            continue

        line = line.split('\t')[1]
        line = line.strip()
        if cur_line == 0:
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1:
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2:
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3:
            cur_al_row = line.split(' ')
        cur_line = cur_line + 1

    # Dump the feature index to stderr for reference.
    for feature in features:
        print(features[feature], '\t', feature, file=sys.stderr)
    return
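# A minimal, hypothetical driver for the maxent variant above. The file
# names, the THRESHOLD value and the way wrap is imported are assumptions,
# not part of the original script; the helpers are assumed to live in the
# repository's common module.
import sys
import common
from common import wrap

THRESHOLD = 1  # assumed minimum frequency for a lexicon entry to be used

if __name__ == '__main__':
    # argv[1]: frequency lexicon (output of extract_freq_lexicon)
    # argv[2]: candidates file (output of extract_sentences)
    ngram_count_patterns(sys.argv[1], sys.argv[2])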
while reading: #{
    try:
        lineno = lineno + 1
        pt_line = phrase_table.readline().strip()
        bt_line = biltrans_out.readline().strip()

        if bt_line == '' and pt_line == '': #{
            reading = False
        #}

        row = pt_line.split('|||')
        sl = common.tokenise_tagger_line(row[1])
        tl = common.tokenise_tagger_line(row[0])
        alignments = row[2].strip()
        bt = common.tokenise_biltrans_line(bt_line)

        if not ambiguous(bt): #{
            continue
        #}
        if len(sl) < 2 and len(tl) < 2: #{
            continue
        #}

        # Here we collect a set of SL words, with their correspondences in the
        # bilingual dictionary, and the word they have been aligned with in the
        # target, e.g. words[0] = ('sl', ['bt1', 'bt2', ...], 'tl')
        translations = {}
        i = 0
        for j in alignments.split(' '): #{
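# Standalone illustration (hypothetical data, not the original code) of the
# structure described in the comment above: the alignment pairs are "tl-sl",
# i.e. the first number indexes the TL row and the second the SL row, matching
# how al_tl and al_sl are unpacked elsewhere in these scripts.
sl = ['resumption<n>', 'of<pr>', 'the<det><def>', 'session<n>']
tl = ['reanudación<n>', 'de<pr>', 'el<det><def>', 'periodo<n>', 'de<pr>', 'sesión<n>']
bt = [{'tls': [w]} for w in sl]   # hypothetical biltrans row, one entry per SL word
alignments = '0-0 1-1 2-2 5-3'

words = {}
for pair in alignments.split(' '):
    if pair == '':
        continue
    al_tl, al_sl = (int(x) for x in pair.split('-'))
    words[al_sl] = (sl[al_sl], bt[al_sl]['tls'], tl[al_tl])

# words[3] == ('session<n>', ['session<n>'], 'sesión<n>')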
while reading: #{
    try:
        lineno = lineno + 1
        pt_line = phrase_table.readline().strip()
        bt_line = biltrans_out.readline().strip()

        if not bt_line.strip() and not pt_line.strip(): #{
            reading = False
            break
        elif not bt_line.strip() or not pt_line.strip(): #{
            continue
        #}

        row = pt_line.split('|||')
        bt = common.tokenise_biltrans_line(bt_line.strip())
        sl = common.tokenise_tagger_line(row[1].strip())
        tl = common.tokenise_tagger_line(row[0].strip())

        if not ambiguous(bt): #{
            not_ambiguous.append(str(lineno))
            if len(not_ambiguous) >= 10: #{
                print("not ambiguous:", ' '.join(not_ambiguous), file=sys.stderr)
                not_ambiguous = []
            #}
            continue
        #}

        if len(sl) < 2 and len(tl) < 2: #{
            continue
                        sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1
                        # print '+', slword, tlword, sl_tl[slword][tlword], lineno
                    #}
                #}
                i = i + 1
            #}
            cur_line = 0
            continue
        #}

        line = line.split('\t')[1]
        if cur_line == 0: #{
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1: #{
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2: #{
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3: #{
            cur_al_row = line.split(' ')
        #}
        cur_line = cur_line + 1
    #}
#}

for sl in sl_tl: #{
    # Sort the translations of each SL word by descending frequency.
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True
def extract_freq_lexicon(canditates):
    cur_line = 0
    lineno = 0
    sl_tl = {}
    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []
    with open(canditates) as infile:
        for line in infile:
            line = line.strip()
            lineno += 1
            if lineno % 5000 == 0:
                sys.stderr.write('.')
            if lineno % 100000 == 0:
                sys.stderr.write(str(lineno) + '\n')
            sys.stderr.flush()
            try:
                if line[0] == '-':
                    # End of a sentence block: note every ambiguous word, its
                    # frequency and its possible translations.
                    #
                    # sl_tl[sl_word][tl_word] = tl_freq
                    i = 0
                    for slword in cur_sl_row:
                        if len(cur_bt_row[i]['tls']) > 1:
                            for al in cur_al_row:
                                if al == '':
                                    continue
                                al_sl = int(al.split('-')[1])
                                al_tl = int(al.split('-')[0])
                                if al_sl != i:
                                    continue
                                if al_tl < len(cur_tl_row):
                                    tlword = cur_tl_row[al_tl]
                                else:
                                    tlword = cur_tl_row[-1]
                                    traceback.print_stack()
                                    print("alignment out of range:", al_tl,
                                          "not in len(", cur_tl_row, ")",
                                          file=sys.stderr)
                                    exit(1)
                                if slword not in sl_tl:
                                    sl_tl[slword] = {}
                                if tlword not in sl_tl[slword]:
                                    sl_tl[slword][tlword] = 0
                                sl_tl[slword][tlword] = sl_tl[slword][tlword] + 1
                                # print '+', slword, tlword, sl_tl[slword][tlword], lineno
                        i = i + 1
                    cur_line = 0
                    continue
                line = line.split('\t')[1]
                if cur_line == 0:
                    cur_sl_row = common.tokenise_tagger_line(line)
                elif cur_line == 1:
                    cur_bt_row = common.tokenise_biltrans_line(line)
                elif cur_line == 2:
                    cur_tl_row = common.tokenise_tagger_line(line)
                elif cur_line == 3:
                    cur_al_row = line.split(' ')
                cur_line = cur_line + 1
            except Exception:
                # print("Error in line", lineno, ":", e, file=sys.stderr)
                traceback.print_exc()
                exit(1)

    for sl in sl_tl:
        # Sort the translations of each SL word by descending frequency;
        # the most frequent one is printed first and marked with '@'.
        newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
        newtl.reverse()
        first = True
        for tl in newtl:
            if tl[0] == '*':
                print('Error: tl word unknown', tl, file=sys.stderr)
                continue
            first_tag_sl = sl.split('<')[1].split('>')[0].strip()
            first_tag_tl = tl.split('<')[1].split('>')[0].strip()
            if first_tag_sl != first_tag_tl:
                print('Error:', first_tag_sl, '!=', first_tag_tl, file=sys.stderr)
                continue
            if first:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl), '@')
                first = False
            else:
                print(sl_tl[sl][tl], common.wrap(sl), common.wrap(tl))
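# Hypothetical driver for extract_freq_lexicon; the file name is illustrative.
# The candidates file is the output of extract_sentences, and the function
# prints the frequency lexicon to stdout, roughly:
#   1028 ^word<n>$ ^palabra<n>$ @     <- most frequent translation, default
#    312 ^word<n>$ ^vocablo<n>$
import sys
import traceback
import common

if __name__ == '__main__':
    extract_freq_lexicon(sys.argv[1])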
def extract_sentences(phrase_table_file, biltrans_out_file):
    lineno = 0
    total_valid = 0
    total_errors = 0
    not_ambiguous = []
    with open(phrase_table_file) as phrase_table, open(
            biltrans_out_file) as biltrans_out:
        while True:
            try:
                lineno = lineno + 1
                pt_line = phrase_table.readline().strip()
                bt_line = biltrans_out.readline().strip()
                if not bt_line.strip() and not pt_line.strip():
                    break
                elif not bt_line.strip() or not pt_line.strip():
                    continue
                row = pt_line.split('|||')
                bt = common.tokenise_biltrans_line(bt_line.strip())
                sl = common.tokenise_tagger_line(row[1].strip())
                tl = common.tokenise_tagger_line(row[0].strip())
                if not common.ambiguous(bt):
                    not_ambiguous.append(str(lineno))
                    if len(not_ambiguous) >= 10:
                        print("not ambiguous:", ' '.join(not_ambiguous),
                              file=sys.stderr)
                        not_ambiguous = []
                    continue
                if len(sl) < 2 and len(tl) < 2:
                    continue
                # Check that the number of words in the lexical transfer
                # and in the phrase table matches up
                if len(sl) != len(bt):
                    print("Error in line", lineno, ": len(sl) != len(bt)",
                          file=sys.stderr)
                    continue
                # Check that the alignments are not empty
                if not row[2].strip():
                    print("In line", lineno, ", alignments are empty",
                          file=sys.stderr)
                    continue
                # Example of one output record (four tab-prefixed lines,
                # then a separator):
                #   Resumption<n> of<pr> the<def><def> session<n>
                #   Resumption<n><sg>/Reanudación<n><f><sg> of<pr>/de<pr> the<det><def><sp>/el<det><def><GD><ND> session<n><sg>/sesión<n><f><sg>
                #   Reanudación<n> de<pr> el<det><def> periodo<n> de<pr> sesión<n>
                #   0-0 1-1 2-2 5-3
                print(lineno, '\t' + row[1])
                print(lineno, '\t' + bt_line)
                print(lineno, '\t' + row[0])
                print(lineno, '\t' + row[2])
                print('-' * (len(bt_line) + 5))
                total_valid += 1
            except Exception as e:
                print("Error in line", lineno, ":", e, file=sys.stderr)
                total_errors += 1
                continue
    print('total:', lineno, file=sys.stderr)
    print('valid:', total_valid,
          '(' + str((total_valid / lineno) * 100) + '%)', file=sys.stderr)
    print('errors:', total_errors,
          '(' + str((total_errors / lineno) * 100) + '%)', file=sys.stderr)
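# Hypothetical end-to-end sketch of the training pipeline these functions
# imply. Paths and parameter values are illustrative; each step writes its
# result to stdout, so in practice the steps run as separate scripts with
# shell redirection. The last call uses the crisphold variant of
# ngram_count_patterns defined below.
import contextlib

if __name__ == '__main__':
    with open('candidates.txt', 'w') as out, contextlib.redirect_stdout(out):
        extract_sentences('europarl.phrasetable', 'europarl.biltrans')

    with open('freq.lex', 'w') as out, contextlib.redirect_stdout(out):
        extract_freq_lexicon('candidates.txt')

    with open('ngrams.txt', 'w') as out, contextlib.redirect_stdout(out):
        ngram_count_patterns('freq.lex', 'candidates.txt',
                             crisphold=1.5, max_rules=10)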
def ngram_count_patterns(freq_lexicon, candidates, crisphold, max_rules):
    MAX_NGRAMS = 2
    cur_line = 0
    sl_tl_defaults = {}
    sl_tl = {}
    ngrams = {}
    lineno = 0

    # Read the frequency lexicon: remember, for each SL word, its default
    # translation (marked with '@') and its alternative translation.
    for line in open(freq_lexicon).readlines():
        lineno += 1
        if lineno % 10000 == 0:
            print(lineno, file=sys.stderr)
        if len(line) < 1:
            continue
        row = common.tokenise_tagger_line(line)
        sl = common.wrap(row[0])
        tl = common.wrap(row[1])
        if tl[1] == '*':
            tl = tl[:-3] + '$'
        if line.count('@') > 0:
            sl_tl_defaults[sl] = tl
        else:
            sl_tl[sl] = tl

    cur_sl_row = []
    cur_tl_row = []
    cur_bt_row = []
    cur_al_row = []

    lineno = 0
    for line in open(candidates).readlines():
        lineno += 1
        line = line.strip()
        if lineno % 500 == 0:
            print(lineno, file=sys.stderr)
        if line[0] == '-':
            # End of a sentence block: note every ambiguous word, its
            # frequency and its possible translations.
            #
            # sl_tl[sl_word][tl_word] = tl_freq
            i = 0
            for slword in cur_sl_row:
                if len(cur_bt_row[i]['tls']) > 1:
                    for al in cur_al_row:
                        if al == '':
                            continue
                        al_sl = int(al.split('-')[1])
                        al_tl = int(al.split('-')[0])
                        if al_sl != i:
                            continue
                        tlword = common.wrap(cur_tl_row[al_tl])
                        slword = common.wrap(slword)
                        if slword not in sl_tl_defaults:
                            print('!', file=sys.stderr)
                            continue
                        # Count pre-, post- and surrounding n-gram contexts
                        # of the ambiguous word.
                        for j in range(1, MAX_NGRAMS):
                            pregram = ' '.join(map(common.wrap, cur_sl_row[i - j:i + 1]))
                            postgram = ' '.join(map(common.wrap, cur_sl_row[i:i + j + 1]))
                            roundgram = ' '.join(map(common.wrap, cur_sl_row[i - j:i + j + 1]))
                            if slword not in ngrams:
                                ngrams[slword] = {}
                            if pregram not in ngrams[slword]:
                                ngrams[slword][pregram] = {}
                            if postgram not in ngrams[slword]:
                                ngrams[slword][postgram] = {}
                            if roundgram not in ngrams[slword]:
                                ngrams[slword][roundgram] = {}
                            if tlword not in ngrams[slword][pregram]:
                                ngrams[slword][pregram][tlword] = 0
                            if tlword not in ngrams[slword][postgram]:
                                ngrams[slword][postgram][tlword] = 0
                            if tlword not in ngrams[slword][roundgram]:
                                ngrams[slword][roundgram][tlword] = 0
                            ngrams[slword][pregram][tlword] = ngrams[slword][pregram][tlword] + 1
                            ngrams[slword][postgram][tlword] = ngrams[slword][postgram][tlword] + 1
                            ngrams[slword][roundgram][tlword] = ngrams[slword][roundgram][tlword] + 1
                i = i + 1
            cur_line = 0
            continue

        line = line.split('\t')[1]
        if cur_line == 0:
            cur_sl_row = common.tokenise_tagger_line(line)
        elif cur_line == 1:
            cur_bt_row = common.tokenise_biltrans_line(line)
        elif cur_line == 2:
            cur_tl_row = common.tokenise_tagger_line(line)
        elif cur_line == 3:
            cur_al_row = line.split(' ')
        cur_line = cur_line + 1

    for sl in ngrams:
        for ngram in ngrams[sl]:
            total = 0
            max_freq = -1
            current_tl = ''
            newtl = sorted(ngrams[sl][ngram], key=lambda x: ngrams[sl][ngram][x])
            newtl.reverse()
            newtl = newtl[:max_rules]
            for tl in newtl:
                if ngrams[sl][ngram][tl] > max_freq:
                    max_freq = ngrams[sl][ngram][tl]
                    current_tl = tl
                total = total + ngrams[sl][ngram][tl]

            # > If for each of the rules we include the amount of time the
            # > translation is seen with that pattern over the total, we get
            # > a number we can try as a threshold, e.g. >0.6, >0.7, >0.8,
            # > etc. (>0.6 would be the same as 2/3 of the time the
            # > alternative translation is seen with that ngram, and 1/3 of
            # > the time the default translation is). I think this would be
            # > easier to explain than the magic number I came up with.
            #
            # I see this as a way to define how "crispy" the decisions are.
            # I think it would be better to express this as a ratio: the
            # ratio of the times the alternative translation is seen to the
            # number of times the default translation is seen with that
            # n-gram.
            #
            # It would be "2" in this case: the alternative is seen twice as
            # often as the default.
            for tl in newtl:
                crispiness = 0.0
                default = sl_tl_defaults[sl]
                alt_crisp = float(ngrams[sl][ngram][tl]) / float(total)
                def_crisp = 1.0
                if default in ngrams[sl][ngram]:
                    def_crisp = float(ngrams[sl][ngram][default]) / float(total)
                weight = float(ngrams[sl][ngram][tl]) / float(total)
                crispiness = alt_crisp / def_crisp
                if crispiness < crisphold:
                    print('-', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl],
                          '\t' + sl + '\t' + ngram + '\t' + tl + '\t' +
                          str(ngrams[sl][ngram][tl]))
                else:
                    print('+', crispiness, weight, total, max_freq,
                          ngrams[sl][ngram][tl],
                          '\t' + sl + '\t' + ngram + '\t' + tl + '\t' +
                          str(ngrams[sl][ngram][current_tl]))
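# Worked example of the crispiness ratio described in the comment above
# (illustrative counts, not taken from real data). Suppose a given n-gram
# context was seen 6 times with the alternative translation and 3 times
# with the default:
counts = {'alternative': 6, 'default': 3}
total = sum(counts.values())                # 9
alt_crisp = counts['alternative'] / total   # 0.666...
def_crisp = counts['default'] / total       # 0.333...
crispiness = alt_crisp / def_crisp          # 2.0: alternative seen twice as often
# With crisphold = 2.0 this rule would be kept ('+');
# with crisphold = 3.0 it would be discarded ('-').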