def next_dm_line(self):
    self.dm_linenum += 1
    self.dm_line = self.dm_file.readline()
    if not self.dm_line:
        self.dm_id, self.dm_row = None, []
        self.reading = False
        return
    ls = self.dm_line.split('\t')
    if self.line_ids:
        self.dm_id = int(self.dm_line.split('.[][')[1].split()[0])
    if self.tokenizer == 'regex':
        self.dm_row = self.lu_sep.split(ls[1].strip()[1:-1])
    elif self.tokenizer == 'biltrans':
        self.dm_row = common.tokenize_biltrans_line(self.dm_line)
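# A minimal sketch of the id parsing above, on a made-up sample line (the
# exact shape of the line-id wrapper comes from the corpus preprocessing, so
# the literal text below is an assumption, not a specification):
#
#   >>> sample = '.[][1527 0].[]\t^casa<n>/house<n>$ ^blanca<adj>/white<adj>$'
#   >>> int(sample.split('.[][')[1].split()[0])
#   1527
#
# Column 1 (after the tab) holds the lexical units, tokenized either with the
# self.lu_sep regex or with common.tokenize_biltrans_line().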
        continue;
    #}
    current_am_line_id = int(am_line.split("\t")[0]);

    # to skip lines in the frac corpus if we have a sub-corpus
    if current_dm_line_id != current_am_line_id: #{
        print('line_id_mismatch: %d != %d' % (current_am_line_id, current_dm_line_id), file=sys.stderr);
        # while current_dm_line_id != current_am_line_id: #{
        #     dm_line = dm_file.readline();
        #     current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]);
        #     print('skipping %d ...' % (current_dm_line_id), file=sys.stderr);
        # #}
    #}

    while current_dm_line_id == current_am_line_id: #{
        am_row = common.tokenize_biltrans_line(am_line);
        dm_row = common.tokenize_biltrans_line(dm_line);

        if len(am_row) != len(dm_row): #{
            amc = len(am_row);
            dmc = len(dm_row);
            print('Mismatch in number of LUs between analysis and training', file=sys.stderr);
            print('am(', amc, '):\t' + am_line, file=sys.stderr);
            print('dm(', dmc, '):\t' + dm_line, file=sys.stderr);
            print('...skipping', file=sys.stderr);
            dm_line = dm_file.readline();
            if dm_line == '': #{
                reading = False;
                break;
            #}
            current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]);
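# Note on the length check above: common.tokenize_biltrans_line is assumed to
# return one element per lexical unit (one ^sl/tl$ chunk), so comparing the
# lengths of am_row and dm_row catches analysis and training lines that have
# drifted out of step; when they differ, the code advances dm_file and re-reads
# the line id until the two corpora realign.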
while reading: #{
    try:
        lineno = lineno + 1;
        pt_line = phrase_table.readline().strip();
        bt_line = biltrans_out.readline().strip();

        if not bt_line.strip() and not pt_line.strip(): #{
            reading = False;
            break
        #}
        elif not bt_line.strip() or not pt_line.strip():
            continue;

        row = pt_line.split(' ||| ');
        bt = common.tokenize_biltrans_line(bt_line);
        sl = common.tokenize_tagger_line(row[1]);
        tl = common.tokenize_tagger_line(row[0]);

        if not ambiguous(bt): #{
            print("line", lineno, "not ambiguous", file=sys.stderr);
            continue;
        #}
        if len(sl) < 2 and len(tl) < 2: #{
            continue;
        #}

        # Check that the number of words in the lexical transfer and in the phrase table match up
        if len(sl) != len(bt): #{
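# Rough shape of the two inputs consumed above; the field order is inferred
# from how row[] is indexed (row[0] -> target side, row[1] -> source side),
# and the sample strings are made up rather than taken from a real corpus:
#
#   pt_line: ^house<n>$ ||| ^casa<n>$ ||| 0.5 ...
#   bt_line: ^casa<n>/house<n>/home<n>$
#
# ambiguous(bt) is assumed to report whether at least one LU in bt_line offers
# more than one translation; unambiguous lines carry no training signal here,
# so they are skipped.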
                i = i + 1;
            #}
            cur_line = 0;
        except:
            print('error in line', lineno, file=sys.stderr);
            #print line;
            continue;
    #}

    line = line.split('\t')[1];

    if cur_line == 0: #{
        cur_sl_row = common.tokenize_tagger_line(line);
    elif cur_line == 1: #{
        cur_bt_row = common.tokenize_biltrans_line(line);
    elif cur_line == 2: #{
        cur_tl_row = common.tokenize_tagger_line(line);
    elif cur_line == 3: #{
        cur_al_row = line.split(' ');
    #}

    cur_line = cur_line + 1;
#}

for sl in sl_tl: #{
    newtl = sorted(sl_tl[sl], key=lambda x: sl_tl[sl][x])
    newtl.reverse()
    first = True;
    for tl in newtl: #{
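# The block above assumes each training record spans four tab-prefixed lines:
# line 0 = source-language (tagger) row, line 1 = bilingual-transfer row,
# line 2 = target-language row, line 3 = word alignments (this split follows
# the cur_line dispatch, not an external spec). sl_tl is taken to map each
# source word to a {translation: count} dict, so newtl ends up ordered from
# the most to the least frequent translation before the 'for tl in newtl'
# loop iterates over it.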
    current_am_line_id = int(am_line.split("\t")[0])

    # to skip lines in the frac corpus if we have a sub-corpus
    if current_dm_line_id != current_am_line_id: #{
        print('line_id_mismatch: %d != %d' % (current_am_line_id, current_dm_line_id), file=sys.stderr)
        # while current_dm_line_id != current_am_line_id: #{
        #     dm_line = dm_file.readline();
        #     current_dm_line_id = int(dm_line.split('.[][')[1].split(' ')[0]);
        #     print('skipping %d ...' % (current_dm_line_id), file=sys.stderr);
        # #}
    #}

    while current_dm_line_id == current_am_line_id: #{
        am_row = common.tokenize_biltrans_line(am_line)
        dm_row = common.tokenize_biltrans_line(dm_line)

        if len(am_row) != len(dm_row): #{
            amc = len(am_row)
            dmc = len(dm_row)
            print('Mismatch in number of LUs between analysis and training', file=sys.stderr)
            print('am(', amc, '):\t' + am_line, file=sys.stderr)
            print('dm(', dmc, '):\t' + dm_line, file=sys.stderr)
            print('...skipping', file=sys.stderr)
            dm_line = dm_file.readline()
            if dm_line == '': #{
                reading = False
                break
            #}