import sys
from collections import defaultdict

import common


def read_frequencies(fname):
    with open(fname) as fin:
        sl_tl = {}
        sl_tl_defaults = {}
        indexes = {}
        trad_counter = defaultdict(lambda: 0)
        for line_ in fin.readlines():
            line = line_.strip()
            if not line:
                continue
            row = common.tokenize_tagger_line(line)
            sl = row[0]
            tl = row[1]
            fr = float(line.split(' ')[0])
            # Remember the order in which each translation of sl was seen
            indexes[(sl, tl)] = trad_counter[sl]
            trad_counter[sl] += 1
            if '@' in line:
                # '@' marks the default translation for this source word
                sl_tl_defaults[sl] = tl
                if fr == 0.0:
                    print('!!! Probably something went wrong here, the default has freq of 0.0',
                          file=sys.stderr)
                else:
                    print('  %s => %s = %.10f' % (sl, tl, fr), file=sys.stderr)
            else:
                sl_tl[sl] = tl
        return sl_tl, sl_tl_defaults, indexes
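# Minimal usage sketch for read_frequencies(); 'lex.freq' is a hypothetical
# file name, assuming lines of the form "42.0 ^casa<n>$ ^house<n>$ @"
# (frequency, source reading, target reading, '@' flagging the default).
if __name__ == '__main__':
    sl_tl, sl_tl_defaults, indexes = read_frequencies('lex.freq')
    for sl, tl in sorted(sl_tl_defaults.items()):
        print('default: %s => %s' % (sl, tl), file=sys.stderr)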
def extract_alig_lrx(lex_freq):
    with open(lex_freq) as d:
        print('<rules>')
        for line in d:
            sys.stdout.flush()
            if line[-2] == '@':
                row = common.tokenize_tagger_line(line)
                fq = line.split(' ')[0]
                sl = row[0]
                tl = row[1]
                if line.count('>') < 2:
                    continue
                print(sl, tl, file=sys.stderr)
                sl_lem = sl.split('<')[0]
                tl_lem = tl.split('<')[0]
                # Escape characters that are special in lrx lemma attributes
                sl_lem = sl_lem.replace('-', '\\-').replace('~', ' ').replace('&', '&amp;')
                tl_lem = tl_lem.replace('-', '\\-').replace('~', ' ').replace('&', '&amp;')
                sl_tag = sl.replace('><', '.').split('<')[1].strip('>')
                tl_tag = tl.replace('><', '.').split('<')[1].strip('>')
                cmb = ''
                cma = ''
                if sl_tag.split('.')[0] not in ['adj', 'vblex', 'n']:
                    # Comment out rules for parts of speech we do not handle
                    cmb = '<!--'
                    cma = '-->'
                rule = cmb + '<rule comment="' + fq + '">'
                # rule = rule + '<match lemma="' + sl_lem + '" tags="' + sl_tag + '"><select lemma="' + tl_lem + '" tags="' + tl_tag + '"/>'
                rule = rule + '<match lemma="' + sl_lem + '"><select lemma="' + tl_lem + '"/>'
                rule = rule + '</match>'
                rule = rule + '</rule>' + cma
                print(rule)
        print('</rules>')
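# For an input line such as "42.0 ^casa<n>$ ^house<n>$ @" (hypothetical),
# extract_alig_lrx() would emit a rule along these lines:
#
#   <rule comment="42.0"><match lemma="casa"><select lemma="house"/></match></rule>
#
# Rules whose first tag is not adj/vblex/n are wrapped in <!-- --> so they
# stay visible in the output but are skipped when the .lrx file is compiled.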
sl_tl_defaults = {}
sl_tl = defaultdict(list)
features = {}  # features[(slword, ['a', 'list'], tlword)] = 3
indexes = {}
trad_counter = defaultdict(lambda: 0)

# First read in the frequency defaults
for line in open(sys.argv[1]):
    line = line.strip()
    if len(line) < 1:
        continue
    row = common.tokenize_tagger_line(line)
    sl = common.wrap(row[0])
    tl = common.wrap(row[1])
    if tl[1] == '*':
        tl = tl[:-3] + '$'
    indexes[(sl, tl)] = trad_counter[sl]
    trad_counter[sl] += 1
    sl_tl[sl].append(tl)
    if line.count('@') > 0:
        sl_tl_defaults[sl] = tl


class Counter(BCC.BiltransCounter):
    tokenizer = 'biltrans'
while reading:
    try:
        lineno = lineno + 1
        pt_line = phrase_table.readline().strip()
        bt_line = biltrans_out.readline().strip()

        if not bt_line.strip() and not pt_line.strip():
            reading = False
            break
        elif not bt_line.strip() or not pt_line.strip():
            continue

        row = pt_line.split(' ||| ')
        bt = common.tokenize_biltrans_line(bt_line)
        sl = common.tokenize_tagger_line(row[1])
        tl = common.tokenize_tagger_line(row[0])

        if not ambiguous(bt):
            print("line", lineno, "not ambiguous", file=sys.stderr)
            continue
        if len(sl) < 2 and len(tl) < 2:
            continue

        # Check that the number of words in the lexical transfer
        # and in the phrase table match up
        if len(sl) != len(bt):
            print("len(sl) != len(bt)", file=sys.stderr)
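# ambiguous() is not defined in this fragment; a minimal sketch, under the
# assumption that a tokenized biltrans row is ambiguous when any token
# carries more than one target-language alternative (the 'tls' key matches
# what tokenize_biltrans_line() returns elsewhere in these scripts):
def ambiguous(bt):
    return any(len(token['tls']) > 1 for token in bt)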
dm_file = open(sys.argv[2])  # File with tagger output
reading = True
lineno = 0
while reading:
    lineno += 1
    if lineno % 1000 == 0:
        print("at line no: ", lineno, file=sys.stderr)
    am_line = am_file.readline()
    dm_line = dm_file.readline()
    if am_line == '':
        reading = False
        continue
    try:
        am_row = common.tokenize_biltrans_line(am_line)
        dm_row = set(common.tokenize_tagger_line(dm_line))
    except:
        continue
    cur_sl_row = [x['sl'] for x in am_row]
    for i in range(0, len(am_row)):
        sl = am_row[i]['sl']
        tls = am_row[i]['tls']
        if len(tls) < 2:
            continue
        for tl in found_tls(tls, dm_row):
            for j in range(1, MAX_NGRAMS):
def wrap(x):
    return '^' + x + '$'


sl_tl_defaults = {}
sl_tl = {}
indexes = {}
trad_counter = {}
rindex = {}

with open(sys.argv[1]) as d:
    for line in d:
        if len(line) < 1:
            continue
        row = common.tokenize_tagger_line(line)
        sl = wrap(row[0].strip())
        tl = wrap(row[1].strip())
        if tl[1] == '*':
            tl = tl[:-3] + '$'
        if sl not in sl_tl:
            sl_tl[sl] = []
        if sl not in trad_counter:
            trad_counter[sl] = 0
        if line.count('@') > 0:
            sl_tl_defaults[sl] = tl
        sl_tl[sl].append(tl)
def ngrams_to_rules(ngrams, crisphold):
    permitted_tags = ['n', 'vblex', 'adj', 'n.*', 'vblex.*', 'adj.*']
    print('<rules>')
    lineno = 1
    ruleno = 0
    for line in open(ngrams).readlines():
        if len(line) < 2:
            continue
        line = line.strip()
        # + 0.571428571429 14 8 8	troiñ<vblex>	tourner<vblex>	8
        row = line.split('\t')
        if len(row) == 3:
            row.insert(0, '')
        # tipus = row[0].split(' ')[0]
        weight = row[0].split(' ')[1]
        sl = row[1].strip()[1:-1]
        tl = row[3][1:-1]
        tl_lema = tl.split('<')[0].lower()
        tl_tags = '<'.join(tl.split('<')[1:]).replace('><', '.').replace('>', '')
        freq = row[4]
        pattern = common.tokenize_tagger_line(row[2])
        if row[2].count('<guio>') > 0 or row[2].count('<sent>') > 0 or row[2].count('<cm>') > 0:
            print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr)
            continue
        inpattern = False
        for w in pattern:
            if w.count(sl) > 0:
                inpattern = True
        if not inpattern:
            print('SL_NOT_IN_PATTERN', line, sl, tl, file=sys.stderr)
            continue
        if tl_tags.count('adj') > 0 and sl.count('adj') < 1:
            print("TAG_MISMATCH", line, file=sys.stderr)
            continue
        if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1:
            print("TAG_MISMATCH", line, file=sys.stderr)
            continue
        if tl_tags.split('.')[0] not in permitted_tags:
            print("TAG_NOT_PERMITTED", tl_tags, '||', line, file=sys.stderr)
            continue
        if float(weight) <= float(crisphold):
            print("UNDER_THRESHOLD", weight, "<", crisphold, "||", line, file=sys.stderr)
            continue
        if any(x.startswith("*") for x in pattern):
            print("UNKNOWN_WORD_IN_PATTERN", pattern, file=sys.stderr)
            continue
        sel = False
        ruleno = ruleno + 1
        lineno = lineno + 1
        print('  <rule c="' + str(ruleno) + ' ' + str(lineno) + ': ' + freq + '" weight="' + weight + '">')
        for word in pattern:
            sl_lema = word.split('<')[0].lower()
            if sl_lema.startswith('*'):
                continue
            if word.count('><') > 0:
                sl_tags = '<'.join(word.split('<')[1:]).replace('><', '.').replace('>', '')
            else:
                sl_tags = '<'.join(word.split('<')[1:]).strip('<>')
            # ======================================================================= #
            # Escape characters that are special in lrx lemma attributes
            sl_lema = sl_lema.replace('~', ' ')
            tl_lema = tl_lema.replace('~', ' ')
            sl_lema = sl_lema.replace('-', '\\-')
            tl_lema = tl_lema.replace('-', '\\-')
            sl_lema = sl_lema.replace('(', '\\(')
            tl_lema = tl_lema.replace('(', '\\(')
            sl_lema = sl_lema.replace(')', '\\)')
            tl_lema = tl_lema.replace(')', '\\)')
            if word.lower().count(sl) > 0:
                lineno = lineno + 1
                if sl_lema == '':
                    print('    <match tags="' + sl_tags + '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')
                else:
                    print('    <match lemma="' + sl_lema + '" tags="' + sl_tags + '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')
                sel = True
            else:
                lineno = lineno + 1
                if sl_lema == '':
                    print('    <match tags="' + sl_tags + '"/>')
                else:
                    print('    <match lemma="' + sl_lema + '" tags="' + sl_tags + '"/>')
        if not sel:
            print('  </rule> <!-- Warning: No select operation ', line, '-->')
        else:
            print('  </rule>')
        lineno = lineno + 1
    print('</rules>')
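# Sketch of the output ngrams_to_rules() would print for the example line in
# the comment above, assuming the pattern is a single word equal to the
# source word (the c= counters depend on position in the file):
#
#   <rule c="1 2: 8" weight="0.571428571429">
#     <match lemma="troiñ" tags="vblex"><select lemma="tourner" tags="vblex"/></match>
#   </rule>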
            i = i + 1

        cur_line = 0
    except:
        print("error in line", lineno, file=sys.stderr)
        # print(line)
        continue

    line = line.split('\t')[1]

    if cur_line == 0:
        cur_sl_row = common.tokenize_tagger_line(line)
    elif cur_line == 1:
        cur_bt_row = common.tokenize_biltrans_line(line)
    elif cur_line == 2:
        cur_tl_row = common.tokenize_tagger_line(line)
    elif cur_line == 3:
        cur_al_row = line.split(' ')

    cur_line = cur_line + 1

for sl in sl_tl:
    # Sort the translations of sl by count, most frequent first
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
MAX_NGRAMS = 3
crisphold = float(sys.argv[3])
cur_line = 0

sl_tl_defaults = {}
sl_tl = {}
ngrams = {}

for line in open(sys.argv[1]).readlines():
    if len(line) < 1:
        continue
    row = common.tokenize_tagger_line(line)
    sl = wrap(row[0])
    tl = wrap(row[1])
    if tl[1] == '*':
        tl = tl[:-3] + '$'
    if line.count('@') > 0:
        sl_tl_defaults[sl] = tl
    else:
        sl_tl[sl] = tl

cur_sl_row = []
cur_tl_row = []
cur_bt_row = []
cur_al_row = []
# + 0.571428571429 14 8 8	troiñ<vblex>	tourner<vblex>	8
row = line.split('\t')
if len(row) == 3:
    row.insert(0, '')
# tipus = row[0].split(' ')[0]
weight = row[0].split(' ')[1]
sl = row[1].strip()[1:-1]
tl = row[3][1:-1]
tl_lema = tl.split('<')[0].lower()
tl_tags = '<'.join(tl.split('<')[1:]).replace('><', '.').replace('>', '')
freq = row[4]
pattern = common.tokenize_tagger_line(row[2])

if row[2].count('<guio>') > 0 or row[2].count('<sent>') > 0 or row[2].count('<cm>') > 0:
    print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr)
    continue

inpattern = False
for w in pattern:
    if w.count(sl) > 0:
        inpattern = True
if not inpattern:
    print('SL_NOT_IN_PATTERN', line, sl, tl, file=sys.stderr)
# File with ambiguous biltrans output
dm_file = open(sys.argv[2])  # File with biltrans output
reading = True
while reading:
    am_line = am_file.readline()
    dm_line = dm_file.readline()
    if am_line == '' and dm_line == '':
        reading = False
        continue
    try:
        am_row = common.tokenize_biltrans_line(am_line)
        dm_row = common.tokenize_tagger_line(dm_line)
    except:
        continue
    limit = len(am_row)
    for i in range(0, limit):
        if len(am_row[i]['tls']) > 1:
            sl = am_row[i]['sl']
            if sl not in sl_tl:
                sl_tl[sl] = {}
            bts = am_row[i]['tls']
            valid_trads = set(bts)
while reading:
    lineno = lineno + 1
    pt_line = phrase_table.readline().strip()
    bt_line = biltrans_out.readline().strip()

    if bt_line == "" and pt_line == "":
        reading = False

    if not ambiguous(bt_line):
        # print(lineno, ' not ambiguous.', file=sys.stderr)
        continue

    row = pt_line.split("|||")
    print(common.tokenize_tagger_line(row[0]))
    bt = bt_line.split()
    sl = row[1].strip()
    tl = row[0].strip()
    aliniaments = row[2].strip()
    bt_row = bt_line.split(" ")
    sl_row = sl.split(" ")
    tl_row = tl.split(" ")

    if len(sl_row) < 2 and len(tl_row) < 2:
        continue

    # Check that the number of words in the lexical transfer
    # and in the phrase table match up
    if len(sl_row) != len(bt_row):
line = line.strip()
print(line, file=sys.stderr)

# + 0.571428571429 14 8 8	troiñ<vblex>	tourner<vblex>	8
row = line.split('\t')
tipus = row[0].split(' ')[0]
weight = row[0].replace('  ', ' ').split(' ')[1]  # collapse double spaces
sl = row[1].strip()[1:-1]
tl = row[3][1:-1]
tl_lema = tl.split('<')[0].lower()
tl_tags = ''.join(tl.split('<')[1:]).replace('>', '.').rstrip('.')
freq = 1
# freq = float(row[4])
pattern = common.tokenize_tagger_line(row[2])

if row[2].count('<guio>') > 0 or row[2].count('<sent>') > 0 or row[2].count('<cm>') > 0:
    print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr)
    continue

if tipus == '-' or tipus == '~':
    print('DEFAULT_READING', line, file=sys.stderr)
    continue

# Hacks
# if len(pattern) == 0:
#     print('ZERO_PATTERN', line, file=sys.stderr)
#     continue
pt_line = phrase_table.readline().strip()
bt_line = biltrans_out.readline().strip()

if bt_line == '' and pt_line == '':
    reading = False

if not ambiguous(bt_line):
    # print(lineno, ' not ambiguous.', file=sys.stderr)
    continue

row = pt_line.split('|||')
print(common.tokenize_tagger_line(row[0]))
bt = bt_line.split()
sl = row[1].strip()
tl = row[0].strip()
aliniaments = row[2].strip()
bt_row = bt_line.split(' ')
sl_row = sl.split(' ')
tl_row = tl.split(' ')

if len(sl_row) < 2 and len(tl_row) < 2:
    continue

# Check that the number of words in the lexical transfer
# and in the phrase table match up
if len(sl_row) != len(bt_row):
# File with tagger output
reading = True
lineno = 0
while reading:
    lineno += 1
    if lineno % 1000 == 0:
        print("at line no: ", lineno, file=sys.stderr)
    am_line = am_file.readline()
    dm_line = dm_file.readline()
    if am_line == '':
        reading = False
        continue
    try:
        am_row = common.tokenize_biltrans_line(am_line)
        dm_row = set(common.tokenize_tagger_line(dm_line))
    except:
        continue
    cur_sl_row = [x['sl'] for x in am_row]
    for i in range(0, len(am_row)):
        sl = am_row[i]['sl']
        tls = am_row[i]['tls']
        if len(tls) < 2:
            continue
        for tl in found_tls(tls, dm_row):
            for j in range(1, MAX_NGRAMS):
                # Source-side context n-gram ending at position i
                pregram = ' '.join(cur_sl_row[i - j:i + 1])
def ngrams_to_rules(ngrams):
    # FREQMIN = 8.0
    MINMATCH = 2
    permitted_tags = ['n', 'vblex', 'adj']
    print('<rules>')
    lineno = 1
    ruleno = 0
    with open(ngrams) as infile:
        for line in infile:
            if len(line) < 2:
                continue
            line = line.strip()
            print(line, file=sys.stderr)
            # + 0.571428571429 14 8 8	troiñ<vblex>	tourner<vblex>	8
            row = line.split('\t')
            tipus = row[0].split(' ')[0]
            weight = row[0].replace('  ', ' ').split(' ')[1]  # collapse double spaces
            sl = row[1].strip()[1:-1]
            tl = row[3][1:-1]
            tl_lema = tl.split('<')[0].lower()
            tl_tags = ''.join(tl.split('<')[1:]).replace('>', '.').rstrip('.')
            freq = 1
            # freq = float(row[4])
            pattern = common.tokenize_tagger_line(row[2])

            if row[2].count('<guio>') > 0 or row[2].count('<sent>') > 0 or row[2].count('<cm>') > 0:
                print('PUNCTUATION_IN_PATTERN', line, file=sys.stderr)
                continue
            if tipus == '-' or tipus == '~':
                print('DEFAULT_READING', line, file=sys.stderr)
                continue
            # Hacks
            # if len(pattern) == 0:
            #     print('ZERO_PATTERN', line, file=sys.stderr)
            #     continue
            if len(pattern) < MINMATCH and len(pattern) > 0:
                print('BELOW_MINMATCH', line, file=sys.stderr)
                continue

            inpattern = False
            for w in pattern:
                if w.lower().count(sl) > 0:
                    inpattern = True
            if len(pattern) > 0 and not inpattern:
                print('SL_NOT_IN_PATTERN', line, file=sys.stderr)
                continue
            if tl_tags.count('adj') > 0 and sl.count('adj') < 1:
                print("TAG_MISMATCH", line, file=sys.stderr)
                continue
            if tl_tags.count('vbmod') > 0 and sl.count('vbmod') < 1:
                print("TAG_MISMATCH", line, file=sys.stderr)
                continue
            if tl_tags.split('.')[0] not in permitted_tags:
                print("TAG_NOT_PERMITTED", tl_tags, '||', line, file=sys.stderr)
                continue

            sel = False
            ruleno = ruleno + 1
            lineno = lineno + 1
            commentb = ''
            commente = ''
            # if freq < FREQMIN:
            #     commentb = '<!--'
            #     commente = '-->'
            print(commentb + '  <rule c="' + str(ruleno) + ' ' + str(lineno) + ': ' + str(freq) + '" weight="' + weight + '">')
            for word in pattern:
                sl_lema = word.split('<')[0].lower()
                if word.count('><') > 0:
                    sl_tags = '<'.join(word.split('<')[1:]).replace('><', '.').replace('>', '')
                else:
                    sl_tags = '<'.join(word.split('<')[1:]).strip('<>')
                # ======================================================================= #
                sl_lema = sl_lema.replace('~', ' ')
                tl_lema = tl_lema.replace('~', ' ')
                # sl_lema = sl_lema.replace('-', '\\-')
                # tl_lema = tl_lema.replace('-', '\\-')
                # sl_lema = sl_lema.replace('(', '\\(')
                # tl_lema = tl_lema.replace('(', '\\(')
                # sl_lema = sl_lema.replace(')', '\\)')
                # tl_lema = tl_lema.replace(')', '\\)')
                if word.lower().count(sl) > 0:
                    lineno = lineno + 1
                    if sl_lema == '':
                        print('    <match tags="' + sl_tags + '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')
                    else:
                        print('    <match lemma="' + sl_lema + '" tags="' + sl_tags + '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')
                    sel = True
                else:
                    lineno = lineno + 1
                    if sl_lema == '':
                        print('    <match tags="' + sl_tags + '"/>')
                    else:
                        print('    <match lemma="' + sl_lema + '" tags="' + sl_tags + '"/>')
            if not sel and len(pattern) == 0:
                # Empty pattern: fall back to matching the source word itself
                sl_lema = sl.split('<')[0]
                if sl.count('><') > 0:
                    sl_tags = '<'.join(sl.split('<')[1:]).replace('><', '.').replace('>', '')
                else:
                    sl_tags = '<'.join(sl.split('<')[1:]).strip('<>')
                if sl_lema == '':
                    print('    <match tags="' + sl_tags + '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')
                else:
                    print('    <match lemma="' + sl_lema + '" tags="' + sl_tags + '"><select lemma="' + tl_lema + '" tags="' + tl_tags + '"/></match>')
                print('  </rule>' + commente)
            elif not sel:
                print('  </rule>' + commente +
                      '<!-- Warning: No select operation ', line, '-->')
            else:
                print('  </rule>' + commente)
            lineno = lineno + 1
    print('</rules>')