Beispiel #1
0
def full_trajectories(partial_trajectories):
    """Replay every partial trajectory and yield one tuple per action.

    ``partial_trajectories`` is an iterable of ``((src, trg), actions)``
    items. For each item a ``State(src, trg)`` is built and the actions are
    applied in order through ``State.transition``.

    Yields:
        ``(k, src, trg_before, trg_after, action)`` where ``k`` is the
        position of the trajectory in the input, ``src``/``trg_before`` come
        from the state the action was applied to, and ``trg_after`` is the
        ``trg`` of the state returned by the transition.
    """
    for index, (pair, actions) in enumerate(partial_trajectories):
        source, target = pair
        current = State(source, target)
        for action in actions:
            successor = current.transition(action)
            yield index, current.src, current.trg, successor.trg, action
            # the next action is applied to the state we just produced
            current = successor
Beispiel #2
0
def extract_from_ter(ter_filename, src_filename):
    """
    Read a TERCOM output file and yield one edit trajectory per record.

    Each yielded item is a pair ``((src_sent, hyp), ops)`` where ``ops`` is a
    list of operations (tuples):
        SUB index word
        DEL index
        INS index word
        MOVE index position
        STOP
    operations are applied from left to right (indices refer to the current
    state of the hypothesis). ``ops`` is an empty list when replaying the
    operations on the hypothesis does not reproduce the reference.

    Args:
        ter_filename: path of the TERCOM output; records are separated by
            blank lines and are expected to contain, in order from the second
            line on, ``Original Ref:``, ``Original Hyp:``,
            ``Hyp After Shift:``, ``Alignment:``, ``NumShifts:`` and then one
            line per shift.
        src_filename: path of the file holding one source sentence per
            TERCOM record.

    The generator ends when the TERCOM file has no more records, or when the
    source file runs out of lines first.
    """
    # Shift line pattern: two integers, the value following '/', and the
    # bracketed word list. Compiled once instead of once per record.
    shift_regex = re.compile(r'\s*\[(\d+), (\d+), .*?/(.*?)\] \(\[(.*?)\]\)')

    with uopen(ter_filename) as f, uopen(src_filename) as src_file:
        while True:
            # Read the TER record before touching the source file, so that
            # reaching the end of the TER file ends the loop normally.
            lines = list(takewhile(lambda line: line.strip(), f))
            if not lines:
                break

            # A bare next() raising StopIteration inside a generator is turned
            # into RuntimeError (PEP 479); use a default and stop cleanly.
            src_line = next(src_file, None)
            if src_line is None:
                break
            src_sent = src_line.strip()

            ops = []
            ref = re.match(r'Original Ref:\s*(.*?)\n', lines[1]).group(1)
            hyp = re.match(r'Original Hyp:\s*(.*?)\n', lines[2]).group(1)
            hyp_after_shift = re.match(r'Hyp After Shift:\s*(.*?)\n',
                                       lines[3]).group(1)
            align = re.match(r'Alignment:\s*\((.*?)\)', lines[4]).group(1)

            numshifts = int(re.match(r'NumShifts: (\d+)', lines[5]).group(1))
            shifts = [
                shift_regex.match(lines[6 + n]).groups()
                for n in range(numshifts)
            ]
            shifts = [(int(i), int(j), int(k), re.sub(r',\s+', ' ', words))
                      for i, j, k, words in shifts]

            shift_indices = get_shifts(shifts, hyp.split(),
                                       hyp_after_shift.split())

            # A shift of a span [i, j) to position k is emitted as one MOVE
            # per word of the span.
            for i, j, k in shift_indices:
                length = j - i
                for x in range(length):
                    if k >= i:
                        op = ('MOVE', i, k + length - 1)
                    else:
                        op = ('MOVE', i + x, k + x)
                    ops.append(op)

            ref_iter = iter(ref.split())
            hyp_iter = iter(hyp_after_shift.split())
            pos = 0
            # `align` is a string; each character is one alignment operation.
            for op in align:
                # insert and delete are reversed in TERCOM
                if op != 'D':
                    next(hyp_iter)
                if op != 'I':
                    inserted = next(ref_iter)

                if op == 'S':
                    ops.append(('SUB', pos, inserted))
                elif op == 'D':
                    ops.append(('INS', pos, inserted))
                elif op == 'I':
                    ops.append(('DEL', pos))
                    # the word is removed, so the position must not advance
                    pos -= 1

                pos += 1

            ops.append(('STOP', ))

            # try to reconstruct reference
            state = State(src_sent, hyp)
            for op in ops:
                state = state.transition(op)

            if state.trg != ref:
                # in some weird and rare cases (likely due to a bug in TERCOM):
                # empty trajectory (index is skipped in the output)
                yield (src_sent, hyp), []
            else:
                yield (src_sent, hyp), ops