Example #1
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target
    document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of problem).

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates : Updates
    """
    res = Updates()

    # case 2 and 5 (to be pruned below)
    res.expected_src_only.extend(src_doc.units)
    res.abnormal_tgt_only.extend(tgt_doc.units)

    # case 1, 2 and 4
    for src, tgt, size in matches:
        tgt_to_src = src - tgt
        res.shift_if_ge[tgt] = tgt_to_src  # case 1 and 2
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        for src_anno in src_annos:
            res.expected_src_only.remove(src_anno)  # prune from case 5
            src_span = src_anno.text_span()
            tgt_equiv = [
                x for x in tgt_annos
                if x.text_span().shift(tgt_to_src) == src_span
            ]
            if not tgt_equiv:  # case 4
                res.abnormal_src_only.append(src_anno)
            for tgt_anno in tgt_equiv:  # prune from case 2
                if tgt_anno in res.abnormal_tgt_only:
                    res.abnormal_tgt_only.remove(tgt_anno)

    return res
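
A usage sketch (not part of the example above): the (src, tgt, size) triples expected by compute_updates have the same shape as difflib's matching blocks, so one plausible way to drive it is to align the raw text of the two documents first. The align helper below and the commented call pattern are assumptions made for illustration; only compute_updates itself comes from the example.

# Hypothetical usage sketch -- align is a made-up helper, not part of educe.
from difflib import SequenceMatcher


def align(src_text, tgt_text):
    """Return (src, tgt, size) triples for text blocks shared by both documents."""
    matcher = SequenceMatcher(None, src_text, tgt_text, autojunk=False)
    # drop the terminating dummy block of size 0
    return [blk for blk in matcher.get_matching_blocks() if blk.size > 0]

# With two educe Documents in hand (construction not shown, and assuming
# Document.text() with no span returns the full text):
# matches = align(src_doc.text(), tgt_doc.text())
# updates = compute_updates(src_doc, tgt_doc, matches)
# for anno in updates.abnormal_src_only:
#     print('source annotation with no target equivalent:', anno)
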
Example #2
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target
    document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of problem).

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates : Updates
    """
    res = Updates()

    # case 2 and 5 (to be pruned below)
    res.expected_src_only.extend(src_doc.units)
    res.abnormal_tgt_only.extend(tgt_doc.units)

    # case 1, 2 and 4
    for src, tgt, size in matches:
        tgt_to_src = src - tgt
        res.shift_if_ge[tgt] = tgt_to_src  # case 1 and 2
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        for src_anno in src_annos:
            res.expected_src_only.remove(src_anno)  # prune from case 5
            src_span = src_anno.text_span()
            tgt_equiv = [x for x in tgt_annos if x.text_span().shift(tgt_to_src) == src_span]
            if not tgt_equiv:  # case 4
                res.abnormal_src_only.append(src_anno)
            for tgt_anno in tgt_equiv:  # prune from case 2
                if tgt_anno in res.abnormal_tgt_only:
                    res.abnormal_tgt_only.remove(tgt_anno)

    return res
Example #3
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    corpus = read_corpus(args)
    for key in corpus:
        doc = corpus[key]
        dialogues = [x for x in doc.units if educe.stac.is_dialogue(x)]
        edus = [x for x in doc.units if educe.stac.is_edu(x)]
        for anno in dialogues:
            dspan = anno.text_span()
            edus_within = enclosed(dspan, edus)
            cols = [friendly_dialogue_id(key, dspan), anno.local_id(), len(edus_within)]
            print("\t".join(map(str, cols)))
Example #4
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    corpus = read_corpus(args)
    for key in corpus:
        doc = corpus[key]
        dialogues = [x for x in doc.units if educe.stac.is_dialogue(x)]
        edus = [x for x in doc.units if educe.stac.is_edu(x)]
        for anno in dialogues:
            dspan = anno.text_span()
            edus_within = enclosed(dspan, edus)
            cols = [friendly_dialogue_id(key, dspan),
                    anno.local_id(),
                    len(edus_within)]
            print('\t'.join(map(str, cols)))
Example #5
def enclosed_lemmas(span, parses):
    """
    Given a span and a list of parses, return any lemmas that
    are within that span
    """
    return [x.features["lemma"] for x in enclosed(span, parses.tokens)]
Example #6
def stretch_match_many(updates, src_doc, tgt_doc, doc_span_src, doc_span_tgt,
                       annos_src, annos_tgt, verbose=0):
    """Compute n-m stretch matches between `annos_src` and `annos_tgt`.

    Parameters
    ----------
    updates : Updates
    src_doc : Document
    tgt_doc : Document
    doc_span_src : Span
    doc_span_tgt : Span
    annos_src : list of educe.annotation
        Unmatched annotations in `doc_span_src`.
    annos_tgt : list of educe.annotation
        Unmatched annotations in `doc_span_tgt`.
    verbose : int
        Verbosity level.

    Returns
    -------
    res : Updates
        Possibly trimmed version of `updates`.
    """
    # unmatched structs in src
    cands_src = enclosed(Span(doc_span_src[0], doc_span_src[1]),
                         annos_src)
    cands_src = sorted(cands_src, key=lambda x: x.span)
    spans_src = [anno.text_span() for anno in cands_src]
    # unmatched structs in tgt
    cands_tgt = enclosed(Span(doc_span_tgt[0], doc_span_tgt[1]),
                         annos_tgt)
    cands_tgt = sorted(cands_tgt, key=lambda x: x.span)
    spans_tgt = [anno.text_span() for anno in cands_tgt]

    if not (spans_src and spans_tgt):
        return updates

    # many to many match between source and target
    seqs_src = find_continuous_seqs(src_doc, spans_src, cands_src)
    seqs_tgt = find_continuous_seqs(tgt_doc, spans_tgt, cands_tgt)

    # TODO if both sequences span the same text (for common turns), use
    # stretched target annotations
    for seq_src, seq_tgt in zip(seqs_src, seqs_tgt):
        seq_spans_src = [spans_src[i] for i in seq_src]
        seq_annos_src = [cands_src[i] for i in seq_src]
        span_seq_src = Span(seq_spans_src[0].char_start,
                            seq_spans_src[-1].char_end)

        seq_spans_tgt = [spans_tgt[i] for i in seq_tgt]
        seq_annos_tgt = [cands_tgt[i] for i in seq_tgt]
        span_seq_tgt = Span(seq_spans_tgt[0].char_start,
                            seq_spans_tgt[-1].char_end)
        # compare (hollowed) text
        txt_src = hollow_out_missing_turn_text(
            src_doc, tgt_doc,
            doc_span_src=span_seq_src,
            doc_span_tgt=span_seq_tgt).replace('\t ', '').replace('\t', '')
        txt_tgt = tgt_doc.text(span=span_seq_tgt)

        if txt_tgt.strip() == txt_src.strip():
            if verbose:
                print('Many-to-many stretch match:\n',
                      'source:\n',
                      '\n'.join(str(x) for x in seq_annos_src),
                      '\ntarget:\n',
                      '\n'.join(str(x) for x in seq_annos_tgt))
            updates = update_updates(updates, seq_annos_src, seq_annos_tgt,
                                     verbose=verbose)

    return updates
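
A toy view of the text comparison at the end of the loop: judging by the replace calls, hollow_out_missing_turn_text is assumed to fill turns that are absent from the target with tab characters, which are then stripped before a whitespace-insensitive comparison. The strings below are invented.

# Toy sketch of the hollowed-text comparison (invented strings, assumed tab filler).
txt_src_hollowed = "101 : amy : yes\t\t103 : bob : ok"
txt_tgt = "101 : amy : yes103 : bob : ok"
txt_src = txt_src_hollowed.replace('\t ', '').replace('\t', '')
print(txt_tgt.strip() == txt_src.strip())  # True, so the two sequences would stretch-match
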
Example #7
def stretch_match(updates, src_doc, tgt_doc, doc_span_src, doc_span_tgt,
                  annos_src, annos_tgt, verbose=0):
    """Compute stretch matches between `annos_src` and `annos_tgt`.

    Parameters
    ----------
    updates : Updates
    src_doc : Document
    tgt_doc : Document
    doc_span_src : Span
    doc_span_tgt : Span
    annos_src : list of educe.annotation
        Unmatched annotations in `doc_span_src`.
    annos_tgt : list of educe.annotation
        Unmatched annotations in `doc_span_tgt`.
    verbose : int
        Verbosity level.

    Returns
    -------
    res : Updates
        Possibly trimmed version of `updates`.
    """
    # unmatched structs in src
    cands_src = enclosed(Span(doc_span_src[0], doc_span_src[1]),
                         annos_src)
    spans_src = [anno.text_span() for anno in cands_src]
    # unmatched structs in tgt
    cands_tgt = enclosed(Span(doc_span_tgt[0], doc_span_tgt[1]),
                         annos_tgt)
    spans_tgt = [anno.text_span() for anno in cands_tgt]

    # {one,many} to one match between source and target
    # FIXME separate matching procedures for:
    # * Turn
    # * paragraph
    # * segment (+ segments with different names in units/ ?)
    for span_tgt, cand_tgt in zip(spans_tgt, cands_tgt):
        # span_tgt cast onto src_doc
        shifted_span_tgt = shift_span(span_tgt, updates)

        # 1-1 match on the exact (translated) same span
        src_equiv = [cand_src for span_src, cand_src
                     in zip(spans_src, cands_src)
                     if span_src == shifted_span_tgt]

        # 1-1 stretch match, based on comparing the text of the turns
        # that are common to source and target
        txt_tgt = tgt_doc.text(span=span_tgt).strip()
        # compute the maximal extension of span_src: include whitespaces
        # immediately before
        lctx_src = [src_doc.text(span=Span(span_src.char_start - 10,
                                           span_src.char_start))
                    for span_src in spans_src]
        lpad_src = [len(x) - len(x.rstrip()) for x in lctx_src]
        # ... and after
        rctx_src = [src_doc.text(span=Span(span_src.char_end,
                                           span_src.char_end + 10))
                    for span_src in spans_src]
        rpad_src = [len(x) - len(x.lstrip()) for x in rctx_src]
        # create the corresponding extended spans
        ext_spans_src = [Span(span_src.char_start - lpad,
                              span_src.char_end + rpad)
                         for span_src, lpad, rpad
                         in zip(spans_src, lpad_src, rpad_src)]
        src_equiv_stretch = [cand_src for span_src, ext_span_src, cand_src
                             in zip(spans_src, ext_spans_src, cands_src)
                             if ((txt_tgt ==
                                  hollow_out_missing_turn_text(
                                      src_doc, tgt_doc,
                                      doc_span_src=span_src,
                                      doc_span_tgt=span_tgt
                                  ).replace('\t ', '').replace('\t', '').strip()) and
                                 ext_span_src.encloses(shifted_span_tgt))]
        # extend list of 1-1 exact matches with 1-1 stretch matches
        if src_equiv_stretch:
            src_equiv.extend(src_equiv_stretch)
            if verbose:
                print('1-to-1 stretch match: ',
                      [str(x) for x in src_equiv_stretch])
                print('for target annotation: ', cand_tgt)

        if src_equiv:
            updates = update_updates(updates, src_equiv, [cand_tgt],
                                     verbose=verbose)
        else:
            # many to 1 match between source and target
            #
            # search for a sequence of contiguous annotations in source
            # that covers the same span as a single annotation of the
            # same type in target; this is supposed to capture the
            # result of `stac-edit merge-{dialogue,edu}`
            src_equiv_cands = enclosed(shifted_span_tgt, cands_src)
            src_equiv_seq = sorted(src_equiv_cands, key=lambda x: x.span)
            # if the sequence covers the targeted span
            if ((src_equiv_seq and
                 src_equiv_seq[0].span.char_start == shifted_span_tgt.char_start and
                 src_equiv_seq[-1].span.char_end == shifted_span_tgt.char_end)):
                # and has no gap or just whitespaces
                gap_str = ''.join(
                    src_doc.text(span=Span(elt_cur.span.char_end,
                                           elt_nex.span.char_start))
                    for elt_cur, elt_nex
                    in zip(src_equiv_seq[:-1], src_equiv_seq[1:])
                )
                gap_str = gap_str.strip()
                if not gap_str:
                    updates = update_updates(
                        updates, src_equiv_seq, [cand_tgt],
                        verbose=verbose)
                    if verbose:
                        print('Guess: {} results from a merge on {}'.format(
                            str(cand_tgt), [str(x) for x in src_equiv_seq]),
                              file=sys.stderr)

    shifted_spans_tgt = [shift_span(span_tgt, updates)
                         for span_tgt in spans_tgt]  # WIP
    # one to many match between source and target
    for span_src, cand_src in zip(spans_src, cands_src):
        # search for a sequence of contiguous annotations in target
        # that covers the same span as a single annotation of the
        # same type in source; this is supposed to capture the
        # result of `stac-edit split-{dialogue,edu}`
        tgt_equiv_cands = [(shifted_span_tgt, cand_tgt)
                           for shifted_span_tgt, cand_tgt
                           in zip(shifted_spans_tgt, cands_tgt)
                           if span_src.encloses(shifted_span_tgt)]

        tgt_equiv_seq = sorted(tgt_equiv_cands)
        # if the sequence covers the source span
        if ((tgt_equiv_seq and
             tgt_equiv_seq[0][0].char_start == span_src.char_start and
             tgt_equiv_seq[-1][0].char_end == span_src.char_end)):
            # and has no gap or just whitespaces
            gap_str = ''.join(
                tgt_doc.text(span=Span(elt_cur[1].span.char_end,
                                       elt_nex[1].span.char_start))
                for elt_cur, elt_nex
                in zip(tgt_equiv_seq[:-1], tgt_equiv_seq[1:])
            )
            gap_str = gap_str.strip()
            if not gap_str:
                updates = update_updates(
                    updates, [cand_src], [x[1] for x in tgt_equiv_seq],
                    verbose=verbose
                )
                if verbose:
                    print('Guess: {} results from a split on {}'.format(
                        [str(x[1]) for x in tgt_equiv_seq], str(cand_src)),
                          file=sys.stderr)

    return updates
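
The whitespace-extension step above (lctx/rctx, lpad/rpad) can be seen in isolation on a plain string. The sketch keeps the 10-character context window from the code but works on a bare str rather than a Document, so it only illustrates the arithmetic.

# Sketch of the whitespace extension on a plain string (not a Document).
text = "greetings   world   again"
start, end = 12, 17                       # the span of "world"
lctx = text[start - 10:start]             # 10 chars of left context, as in the code above
rctx = text[end:end + 10]                 # 10 chars of right context
lpad = len(lctx) - len(lctx.rstrip())     # whitespace immediately before the span
rpad = len(rctx) - len(rctx.lstrip())     # whitespace immediately after the span
print(repr(text[start - lpad:end + rpad]))  # '   world   '
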
Example #8
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target
    document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of problem).

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates : Updates
    """
    res = Updates()

    # case 2 and 5 (to be pruned below)
    res.expected_src_only.extend(src_doc.units)
    res.abnormal_tgt_only.extend(tgt_doc.units)

    # WIP separate matching procedures for EDUs, turns, paragraphs,
    # dialogues and the rest (seemingly only resources).
    def is_various(annotation):
        """None of {edu, turn, paragraph, dialogue}.

        It seems to capture only Resources (to be confirmed).
        """
        return not (is_edu(annotation) or
                    is_turn(annotation) or
                    is_paragraph(annotation) or
                    is_dialogue(annotation))

    # case 1, 2 and 4
    for src, tgt, size in matches:
        tgt_to_src = src - tgt
        res.shift_if_ge[tgt] = tgt_to_src  # case 1 and 2
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        # WIP separate matching procedures for the different types of
        # annotations
        for anno_type in [is_edu, is_turn, is_paragraph, is_dialogue,
                          is_various]:
            cands_src = [x for x in src_annos if anno_type(x)]
            cands_tgt = [x for x in tgt_annos if anno_type(x)]
            # compute (shifted) spans once
            spans_src = [x.text_span() for x in cands_src]
            spans_tgt = [x.text_span().shift(tgt_to_src) for x in cands_tgt]
            # loop over source annotations
            for src_span, src_anno in zip(spans_src, cands_src):
                res.expected_src_only.remove(src_anno)  # prune from case 5
                tgt_equiv = [tgt_anno for tgt_span, tgt_anno
                             in zip(spans_tgt, cands_tgt)
                             if tgt_span == src_span]
                if not tgt_equiv:  # case 4
                    res.abnormal_src_only.append(src_anno)
                for tgt_anno in tgt_equiv:  # prune from case 2
                    if tgt_anno in res.abnormal_tgt_only:
                        res.abnormal_tgt_only.remove(tgt_anno)

    return res
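
The anno_type loop above simply partitions annotations by predicate before matching spans within each group. The miniature below shows that pattern with made-up predicates and plain strings standing in for educe annotations.

# Miniature of the per-type partitioning (predicates and data are invented).
annos = ['edu_1', 'turn_1', 'edu_2', 'resource_1']


def is_edu(anno):
    return anno.startswith('edu')


def is_turn(anno):
    return anno.startswith('turn')


def is_various(anno):
    return not (is_edu(anno) or is_turn(anno))


for anno_type in (is_edu, is_turn, is_various):
    print(anno_type.__name__, [x for x in annos if anno_type(x)])
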
Example #9
def stretch_match_many(updates,
                       src_doc,
                       tgt_doc,
                       doc_span_src,
                       doc_span_tgt,
                       annos_src,
                       annos_tgt,
                       verbose=0):
    """Compute n-m stretch matches between `annos_src` and `annos_tgt`.

    Parameters
    ----------
    updates : Updates
    src_doc : Document
    tgt_doc : Document
    doc_span_src : Span
    doc_span_tgt : Span
    annos_src : list of educe.annotation
        Unmatched annotations in `doc_span_src`.
    annos_tgt : list of educe.annotation
        Unmatched annotations in `doc_span_tgt`.
    verbose : int
        Verbosity level.

    Returns
    -------
    res : Updates
        Possibly trimmed version of `updates`.
    """
    # unmatched structs in src
    cands_src = enclosed(Span(doc_span_src[0], doc_span_src[1]), annos_src)
    cands_src = sorted(cands_src, key=lambda x: x.span)
    spans_src = [anno.text_span() for anno in cands_src]
    # unmatched structs in tgt
    cands_tgt = enclosed(Span(doc_span_tgt[0], doc_span_tgt[1]), annos_tgt)
    cands_tgt = sorted(cands_tgt, key=lambda x: x.span)
    spans_tgt = [anno.text_span() for anno in cands_tgt]

    if not (spans_src and spans_tgt):
        return updates

    # many to many match between source and target
    seqs_src = find_continuous_seqs(src_doc, spans_src, cands_src)
    seqs_tgt = find_continuous_seqs(tgt_doc, spans_tgt, cands_tgt)

    # TODO if both sequences span the same text (for common turns), use
    # stretched target annotations
    for seq_src, seq_tgt in zip(seqs_src, seqs_tgt):
        seq_spans_src = [spans_src[i] for i in seq_src]
        seq_annos_src = [cands_src[i] for i in seq_src]
        span_seq_src = Span(seq_spans_src[0].char_start,
                            seq_spans_src[-1].char_end)

        seq_spans_tgt = [spans_tgt[i] for i in seq_tgt]
        seq_annos_tgt = [cands_tgt[i] for i in seq_tgt]
        span_seq_tgt = Span(seq_spans_tgt[0].char_start,
                            seq_spans_tgt[-1].char_end)
        # compare (hollowed) text
        txt_src = hollow_out_missing_turn_text(
            src_doc,
            tgt_doc,
            doc_span_src=span_seq_src,
            doc_span_tgt=span_seq_tgt).replace('\t ', '').replace('\t', '')
        txt_tgt = tgt_doc.text(span=span_seq_tgt)

        if txt_tgt.strip() == txt_src.strip():
            if verbose:
                print('Many-to-many stretch match:\n', 'source:\n',
                      '\n'.join(str(x) for x in seq_annos_src), '\ntarget:\n',
                      '\n'.join(str(x) for x in seq_annos_tgt))
            updates = update_updates(updates,
                                     seq_annos_src,
                                     seq_annos_tgt,
                                     verbose=verbose)

    return updates
Example #10
def stretch_match(updates,
                  src_doc,
                  tgt_doc,
                  doc_span_src,
                  doc_span_tgt,
                  annos_src,
                  annos_tgt,
                  verbose=0):
    """Compute stretch matches between `annos_src` and `annos_tgt`.

    Parameters
    ----------
    updates : Updates
    src_doc : Document
    tgt_doc : Document
    doc_span_src : Span
    doc_span_tgt : Span
    annos_src : list of educe.annotation
        Unmatched annotations in `doc_span_src`.
    annos_tgt : list of educe.annotation
        Unmatched annotations in `doc_span_tgt`.
    verbose : int
        Verbosity level.

    Returns
    -------
    res : Updates
        Possibly trimmed version of `updates`.
    """
    # unmatched structs in src
    cands_src = enclosed(Span(doc_span_src[0], doc_span_src[1]), annos_src)
    spans_src = [anno.text_span() for anno in cands_src]
    # unmatched structs in tgt
    cands_tgt = enclosed(Span(doc_span_tgt[0], doc_span_tgt[1]), annos_tgt)
    spans_tgt = [anno.text_span() for anno in cands_tgt]

    # {one,many} to one match between source and target
    # FIXME separate matching procedures for:
    # * Turn
    # * paragraph
    # * segment (+ segments with different names in units/ ?)
    for span_tgt, cand_tgt in zip(spans_tgt, cands_tgt):
        # span_tgt cast onto src_doc
        shifted_span_tgt = shift_span(span_tgt, updates)

        # 1-1 match on the exact (translated) same span
        src_equiv = [
            cand_src for span_src, cand_src in zip(spans_src, cands_src)
            if span_src == shifted_span_tgt
        ]

        # 1-1 stretch match, based on comparing the text of the turns
        # that are common to source and target
        txt_tgt = tgt_doc.text(span=span_tgt).strip()
        # compute the maximal extension of span_src: include whitespaces
        # immediately before
        lctx_src = [
            src_doc.text(span=Span(span_src.char_start -
                                   10, span_src.char_start))
            for span_src in spans_src
        ]
        lpad_src = [len(x) - len(x.rstrip()) for x in lctx_src]
        # ... and after
        rctx_src = [
            src_doc.text(span=Span(span_src.char_end, span_src.char_end + 10))
            for span_src in spans_src
        ]
        rpad_src = [len(x) - len(x.lstrip()) for x in rctx_src]
        # create the corresponding extended spans
        ext_spans_src = [
            Span(span_src.char_start - lpad, span_src.char_end + rpad)
            for span_src, lpad, rpad in zip(spans_src, lpad_src, rpad_src)
        ]
        src_equiv_stretch = [
            cand_src for span_src, ext_span_src, cand_src in zip(
                spans_src, ext_spans_src, cands_src)
            if ((txt_tgt == hollow_out_missing_turn_text(
                src_doc, tgt_doc, doc_span_src=span_src, doc_span_tgt=span_tgt
            ).replace('\t ', '').replace('\t', '').strip())
                and ext_span_src.encloses(shifted_span_tgt))
        ]
        # extend list of 1-1 exact matches with 1-1 stretch matches
        if src_equiv_stretch:
            src_equiv.extend(src_equiv_stretch)
            if verbose:
                print('1-to-1 stretch match: ',
                      [str(x) for x in src_equiv_stretch])
                print('for target annotation: ', cand_tgt)

        if src_equiv:
            updates = update_updates(updates,
                                     src_equiv, [cand_tgt],
                                     verbose=verbose)
        else:
            # many to 1 match between source and target
            #
            # search for a sequence of contiguous annotations in source
            # that covers the same span as a single annotation of the
            # same type in target; this is supposed to capture the
            # result of `stac-edit merge-{dialogue,edu}`
            src_equiv_cands = enclosed(shifted_span_tgt, cands_src)
            src_equiv_seq = sorted(src_equiv_cands, key=lambda x: x.span)
            # if the sequence covers the targeted span
            if ((src_equiv_seq and (src_equiv_seq[0].span.char_start
                                    == shifted_span_tgt.char_start)
                 and (src_equiv_seq[-1].span.char_end
                      == shifted_span_tgt.char_end))):
                # and has no gap or just whitespaces
                gap_str = ''.join(
                    src_doc.text(span=Span(elt_cur.span.char_end,
                                           elt_nex.span.char_start))
                    for elt_cur, elt_nex in zip(src_equiv_seq[:-1],
                                                src_equiv_seq[1:]))
                gap_str = gap_str.strip()
                if not gap_str:
                    updates = update_updates(updates,
                                             src_equiv_seq, [cand_tgt],
                                             verbose=verbose)
                    if verbose:
                        print('Guess: {} results from a merge on {}'.format(
                            str(cand_tgt), [str(x) for x in src_equiv_seq]),
                              file=sys.stderr)

    shifted_spans_tgt = [
        shift_span(span_tgt, updates) for span_tgt in spans_tgt
    ]  # WIP
    # one to many match between source and target
    for span_src, cand_src in zip(spans_src, cands_src):
        # search for a sequence of contiguous annotations in target
        # that covers the same span as a single annotation of the
        # same type in source; this is supposed to capture the
        # result of `stac-edit split-{dialogue,edu}`
        tgt_equiv_cands = [
            (shifted_span_tgt, cand_tgt)
            for shifted_span_tgt, cand_tgt in zip(shifted_spans_tgt, cands_tgt)
            if span_src.encloses(shifted_span_tgt)
        ]

        tgt_equiv_seq = sorted(tgt_equiv_cands)
        # if the sequence covers the source span
        if ((tgt_equiv_seq
             and tgt_equiv_seq[0][0].char_start == span_src.char_start
             and tgt_equiv_seq[-1][0].char_end == span_src.char_end)):
            # and has no gap or just whitespaces
            gap_str = ''.join(
                tgt_doc.text(span=Span(elt_cur[1].span.char_end,
                                       elt_nex[1].span.char_start)) for
                elt_cur, elt_nex in zip(tgt_equiv_seq[:-1], tgt_equiv_seq[1:]))
            gap_str = gap_str.strip()
            if not gap_str:
                updates = update_updates(updates, [cand_src],
                                         [x[1] for x in tgt_equiv_seq],
                                         verbose=verbose)
                if verbose:
                    print('Guess: {} results from a split on {}'.format(
                        [str(x[1]) for x in tgt_equiv_seq], str(cand_src)),
                          file=sys.stderr)

    return updates
Example #11
def compute_updates(src_doc, tgt_doc, matches):
    """Return updates that would need to be made on the target
    document.

    Given matches between the source and target document, return span
    updates along with any source annotations that do not have an
    equivalent in the target document (the latter may indicate that
    resegmentation has taken place, or that there is some kind of problem).

    Parameters
    ----------
    src_doc : Document
    tgt_doc : Document
    matches : [Match]

    Returns
    -------
    updates : Updates
    """
    res = Updates()

    # case 2 and 5 (to be pruned below)
    res.expected_src_only.extend(src_doc.units)
    res.abnormal_tgt_only.extend(tgt_doc.units)

    # WIP separate matching procedures for EDUs, turns, paragraphs,
    # dialogues and the rest (seemingly only resources).
    def is_various(annotation):
        """None of {edu, turn, paragraph, dialogue}.

        It seems to capture only Resources (to be confirmed).
        """
        return not (is_edu(annotation) or is_turn(annotation)
                    or is_paragraph(annotation) or is_dialogue(annotation))

    # case 1, 2 and 4
    for src, tgt, size in matches:
        tgt_to_src = src - tgt
        res.shift_if_ge[tgt] = tgt_to_src  # case 1 and 2
        src_annos = enclosed(Span(src, src + size), src_doc.units)
        tgt_annos = enclosed(Span(tgt, tgt + size), tgt_doc.units)
        # WIP separate matching procedures for the different types of
        # annotations
        for anno_type in [
                is_edu, is_turn, is_paragraph, is_dialogue, is_various
        ]:
            cands_src = [x for x in src_annos if anno_type(x)]
            cands_tgt = [x for x in tgt_annos if anno_type(x)]
            # compute (shifted) spans once
            spans_src = [x.text_span() for x in cands_src]
            spans_tgt = [x.text_span().shift(tgt_to_src) for x in cands_tgt]
            # loop over source annotations
            for src_span, src_anno in zip(spans_src, cands_src):
                res.expected_src_only.remove(src_anno)  # prune from case 5
                tgt_equiv = [
                    tgt_anno
                    for tgt_span, tgt_anno in zip(spans_tgt, cands_tgt)
                    if tgt_span == src_span
                ]
                if not tgt_equiv:  # case 4
                    res.abnormal_src_only.append(src_anno)
                for tgt_anno in tgt_equiv:  # prune from case 2
                    if tgt_anno in res.abnormal_tgt_only:
                        res.abnormal_tgt_only.remove(tgt_anno)

    return res