def tercom_scores_unicode(hypotheses, references):
    """
    Returns a list of TERCOM scores

    Each hypothesis/reference pair is written as one line of the form
    ``<sentence> (<index>)`` into two UTF-8 encoded temporary files, then
    ``java -jar tercom.jar`` is run on them (``tercom.jar`` is resolved
    relative to the current working directory).

    :param hypotheses: iterable of hypothesis sentences (unicode strings)
    :param references: iterable of reference sentences, paired with
        `hypotheses` by position (extra items on either side are ignored
        because the two are zipped)
    :return: list of floats, one per line of the generated ``.ter`` file
        after its first two lines; the score is the last space-separated
        field of each line
    """
    writer = codecs.getwriter('utf-8')

    # The codecs writer hands *bytes* to the underlying stream, so the
    # temporary files must be opened in binary mode (text mode raises a
    # TypeError on Python 3).
    with NamedTemporaryFile('wb') as hypothesis_file, \
            NamedTemporaryFile('wb') as reference_file:
        hypothesis_file = writer(hypothesis_file)
        reference_file = writer(reference_file)

        for i, (hypothesis, reference) in enumerate(zip(hypotheses, references)):
            hypothesis_file.write(u'{} ({})\n'.format(hypothesis, i))
            reference_file.write(u'{} ({})\n'.format(reference, i))

        # make sure everything is on disk before TERCOM reads the files
        hypothesis_file.flush()
        reference_file.flush()

        # NOTE(review): mktemp() only generates a name and is race-prone;
        # it is kept because TERCOM uses it as an output prefix ('-n').
        filename = mktemp()

        cmd = ['java', '-jar', 'tercom.jar', '-h', hypothesis_file.name,
               '-r', reference_file.name, '-o', 'ter', '-n', filename]

        # TERCOM's console output is discarded; the handle is closed as soon
        # as the subprocess returns instead of being leaked.
        with uopen('/dev/null', 'w') as output:
            call(cmd, stdout=output, stderr=output)

    ter_path = filename + '.ter'
    try:
        with uopen(ter_path) as f:
            lines = list(f)
    finally:
        # remove TERCOM's output even when reading it failed
        if os.path.exists(ter_path):
            os.remove(ter_path)

    # the first two lines are skipped (presumably a header -- TODO confirm)
    scores = [float(line.split(' ')[-1]) for line in lines[2:]]
    return scores
def extract_from_ibm(filename1, filename2):
    """
    Takes two GIZA++ alignment files (first file is source->target, second
    file is target->source), and yields trajectories.

    Only every third line of each file is used, starting with the third
    line; the two files are read in lockstep.

    For each pair of lines, yields a tuple `(sentence, ops)` where
    `sentence` is the space-joined word list obtained from the first file
    (before any edit is applied) and `ops` is a list of operation tuples:
        ('DEL', index)
        ('SUB', index, word)
        ('MOVE', from_index, to_index)
        ('INS', index, word)
    always terminated by ('STOP',). Indices refer to the current state of
    the word list being edited.
    """
    with uopen(filename1) as file1, uopen(filename2) as file2:
        for line1, line2 in zip(islice(file1, 2, None, 3), islice(file2, 2, None, 3)):
            # NOTE(review): get_pairs is defined elsewhere; it is used here as
            # returning (list of index pairs, list of words) -- confirm.
            pairs1, words1 = get_pairs(line1.strip())
            pairs2, words2 = get_pairs(line2.strip())
            # flip each pair of the second file so that both lists use the
            # same orientation
            pairs2 = [pair[::-1] for pair in pairs2]

            # snapshot of the first word list before it is edited in place
            sentence = ' '.join(words1)

            # keep only the links that appear in both alignment directions
            intersection = list(set(pairs1).intersection(set(pairs2)))

            ops = []
            pairs = intersection[:]
            # extra pair linking the position just past the end of both
            # word lists
            pairs.append((len(words1), len(words2)))

            # words1 is edited in place, position by position, towards words2
            i = 0
            while i < len(words2):
                # first component of the first pair whose second component
                # is i (None if there is no such pair)
                x = next((x for x, y in pairs if y == i), None)

                if not any(s == i for s, _ in pairs):
                    # no pair has i as its first component: drop words1[i]
                    # and shift down the first components located after i
                    # (i is not advanced)
                    ops.append(('DEL', i))
                    del words1[i]
                    pairs = [(k - int(i < k), l) for k, l in pairs]
                elif x == i:
                    # position i is linked to itself: substitute the word
                    # if it differs, then move on to the next position
                    if words1[i] != words2[i]:
                        ops.append(('SUB', i, words2[i]))
                    pairs.remove((i, i))
                    words1[i] = words2[i]
                    i += 1
                elif x is not None:  # move
                    # TODO: fix
                    # move the word at x to position i; the moved pair
                    # becomes (i, i) and first components in [i, x) shift up
                    # by one (i is not advanced)
                    ops.append(('MOVE', x, i))
                    words1.insert(i, words1.pop(x))
                    pairs = [(i, l) if k == x and l == i else (k + int(i <= k < x), l) for k, l in pairs]
                else:  # insertion
                    # nothing is linked to position i of words2: insert the
                    # word and shift up the first components at or after i
                    ops.append(('INS', i, words2[i]))
                    words1.insert(i, words2[i])
                    pairs = [(k + int(i <= k), l) for k, l in pairs]
                    i += 1

            ops.append(('STOP', ))
            yield sentence, ops
def read_trajectories(filename):
    """
    Read a trajectory file and return its transitions grouped by trajectory.

    A trajectory file contains entries each corresponding to a transition,
    whose fields are delimited by '|||'. Transitions contain those fields:
        id, src, s, s', a, score(s), score(s')
    The first field is the id of this transition's trajectory. Consecutive
    lines sharing the same id belong to the same trajectory.

    :param filename: path of the trajectory file
    :return: list of trajectories, each one being a list of tuples
        (state1, state2, action, score1, score2), where the states are
        State(src, s) and State(src, s'), the action is a tuple whose
        indices are converted to int, and the scores are floats
    :raises Exception: if an action is not SUB, INS, DEL, MOVE or STOP
    """
    trajectories = []

    with uopen(filename) as f:
        current_idx = None
        current_traj = []

        for line in f:
            idx, src, trg1, trg2, action, score1, score2 = line.split('|||')
            score1, score2 = float(score1), float(score2)

            action = action.split()
            op = action[0]
            if op in ['SUB', 'INS', 'DEL']:
                # only the position is numeric (SUB/INS also carry a word)
                action[1] = int(action[1])
            elif op == 'MOVE':
                # every argument of MOVE is an index
                action[1:] = [int(arg) for arg in action[1:]]
            elif op != 'STOP':
                raise Exception('Unknown action type')
            action = tuple(action)

            # a state is a pair (source sentence, translation hypothesis)
            # source sentence is the same for all transitions in a given trajectory
            state1 = State(src, trg1)
            state2 = State(src, trg2)
            transition = (state1, state2, action, score1, score2)

            # a different index marks the end of a trajectory
            if current_idx is not None and idx != current_idx:
                trajectories.append(current_traj)
                current_traj = []

            current_traj.append(transition)
            current_idx = idx

    # the last trajectory is not followed by a different index, so it has
    # to be stored explicitly (it used to be silently dropped)
    if current_traj:
        trajectories.append(current_traj)

    return trajectories
def read_trajectories(filename):
    """
    Load the transitions of a trajectory file, grouped by trajectory.

    A trajectory file contains entries each corresponding to a transition,
    whose fields are delimited by '|||'. Transitions contain those fields:
        id, src, s, s', a, score(s), score(s')
    The first field is the id of this transition's trajectory: a run of
    consecutive lines with the same id forms one trajectory.

    :param filename: path of the trajectory file
    :return: list of trajectories; each trajectory is a list of tuples
        (state1, state2, action, score1, score2) built from State(src, s),
        State(src, s'), the parsed action tuple and the two float scores
    :raises Exception: if an action is not SUB, INS, DEL, MOVE or STOP
    """
    trajectories = []

    with uopen(filename) as f:
        previous_idx = None

        for line in f:
            idx, src, trg1, trg2, action, score1, score2 = line.split('|||')
            score1, score2 = float(score1), float(score2)

            action = action.split()
            op = action[0]
            if op in ['SUB', 'INS', 'DEL']:
                # the position is an integer; SUB and INS keep their word
                action[1] = int(action[1])
            elif op == 'MOVE':
                # MOVE only takes integer arguments
                action[1:] = [int(arg) for arg in action[1:]]
            elif op != 'STOP':
                raise Exception('Unknown action type')
            action = tuple(action)

            # a state is a pair (source sentence, translation hypothesis)
            # source sentence is the same for all transitions in a given trajectory
            state1 = State(src, trg1)
            state2 = State(src, trg2)
            transition = (state1, state2, action, score1, score2)

            # A new id (or the very first line) opens a new trajectory.
            # Appending the list as soon as it is opened guarantees that the
            # final trajectory is returned as well; it used to be lost
            # because nothing flushed it after the last line.
            if previous_idx is None or idx != previous_idx:
                trajectories.append([])

            trajectories[-1].append(transition)
            previous_idx = idx

    return trajectories
def extract_from_ter(ter_filename, src_filename):
    """
    Yields trajectories (lists of operations):
        SUB index word
        DEL index
        INS index word
        MOVE index position

    operations are applied from left to right (indices refer to the
    current state of the hypothesis)

    The TER file is read as blocks of non-empty lines separated by an empty
    line; one line of the source file is consumed per block. Inside a
    block, the following lines are expected (0-based positions):
        1: 'Original Ref: ...'
        2: 'Original Hyp: ...'
        3: 'Hyp After Shift: ...'
        4: 'Alignment: (...)'
        5: 'NumShifts: N'
        6 and following: one line per shift

    Each yielded item is a tuple `((src_sent, hyp), ops)` where `ops` ends
    with ('STOP',). When replaying `ops` on the hypothesis does not give
    back the reference, `ops` is an empty list instead.

    Iteration ends when either file is exhausted.
    """
    with uopen(ter_filename) as f, uopen(src_filename) as src_file:
        while True:
            # A bare next() would raise StopIteration at the end of the
            # source file, which is turned into a RuntimeError inside a
            # generator on Python 3.7+ (PEP 479); end the iteration instead.
            src_line = next(src_file, None)
            if src_line is None:
                break
            src_sent = src_line.strip()

            ops = []

            # one block = all the lines up to the next empty line
            lines = list(takewhile(lambda line: line.strip(), f))
            if not lines:
                break

            ref = re.match(r'Original Ref:\s*(.*?)\n', lines[1]).group(1)
            hyp = re.match(r'Original Hyp:\s*(.*?)\n', lines[2]).group(1)
            hyp_after_shift = re.match(r'Hyp After Shift:\s*(.*?)\n', lines[3]).group(1)
            align = re.match(r'Alignment:\s*\((.*?)\)', lines[4]).group(1)

            numshifts = int(re.match(r'NumShifts: (\d+)', lines[5]).group(1))

            # three integers and the bracketed list of shifted words
            regex = re.compile(r'\s*\[(\d+), (\d+), .*?/(.*?)\] \(\[(.*?)\]\)')
            shifts = [
                regex.match(lines[6 + i]).groups() for i in range(numshifts)
            ]
            # the words are comma-separated in the file: turn them into a
            # space-separated string
            shifts = [(int(i), int(j), int(k), re.sub(r',\s+', ' ', words))
                      for i, j, k, words in shifts]

            shift_indices = get_shifts(shifts, hyp.split(), hyp_after_shift.split())

            # a shift of several words is expressed as one MOVE per word
            for i, j, k in shift_indices:
                length = j - i
                for x in range(length):
                    if k >= i:
                        op = ('MOVE', i, k + length - 1)
                    else:
                        op = ('MOVE', i + x, k + x)
                    ops.append(op)

            ref_iter = iter(ref.split())
            hyp_iter = iter(hyp_after_shift.split())

            # every character of the alignment string is one edit code;
            # i tracks the position in the hypothesis being edited
            i = 0
            for op in align:
                # insert and delete are reversed in TERCOM
                if op != 'D':
                    next(hyp_iter)
                if op != 'I':
                    inserted = next(ref_iter)

                if op == 'S':
                    ops.append(('SUB', i, inserted))
                elif op == 'D':
                    ops.append(('INS', i, inserted))
                elif op == 'I':
                    ops.append(('DEL', i))
                    # the word is removed, so the position must not advance
                    i -= 1
                i += 1

            ops.append(('STOP', ))

            # try to reconstruct reference
            state = State(src_sent, hyp)
            for op in ops:
                state = state.transition(op)
            if state.trg != ref:
                # in some weird and rare cases (likely due to a bug in TERCOM)
                yield (src_sent, hyp), []  # empty trajectory (index is skipped in the output)
                continue

            yield (src_sent, hyp), ops