Esempio n. 1
0
def absorb_emoticon(doc, stamp, penult, last):
    """
    Fold a trailing emoticon-only EDU into the EDU that precedes it.

    `penult` is the second to last EDU of a turn and `last` is the
    emoticon-only EDU that follows it.  The span of `penult` is widened
    so that it also covers `last`; `penult` is then re-dated with `stamp`
    and re-authored as "stacutil".

    `penult` is the only annotation modified directly here (`last` is
    left untouched and is not removed from the document by this
    function).  Nothing is returned.

    Because the id of `penult` may no longer be the same afterwards, the
    old id is handed to `retarget` so that relations/schemas in `doc`
    can be pointed at the updated annotation.
    """
    # Read the id first: the metadata rewritten below presumably feeds
    # into it (to be confirmed against `local_id`).
    previous_id = penult.local_id()

    widened = penult.text_span().merge(last.text_span())
    penult.span = widened

    set_anno_date(penult, stamp)
    set_anno_author(penult, "stacutil")

    retarget(doc, previous_id, penult)
Esempio n. 2
0
def absorb_emoticon(doc, stamp, penult, last):
    """
    Merge an emoticon-only EDU (`last`) into its predecessor (`penult`).

    `penult` is the second to last EDU in a turn annotation, and `last`
    the emoticon-only EDU right after it.  After the call, the span of
    `penult` is the merge of both text spans, its date is `stamp` and
    its author is "stacutil".

    Only `penult` is mutated directly; `last` is not modified or removed
    here, and there is no return value.

    The id that `penult` had on entry is passed to `retarget` along with
    the updated annotation, so that relations/schemas in `doc` which
    used the old id can follow it.
    """
    # Captured up front, before date/author change (the id is presumably
    # derived from them -- to be confirmed against `local_id`).
    id_before = penult.local_id()

    combined_span = penult.text_span().merge(last.text_span())
    penult.span = combined_span

    set_anno_date(penult, stamp)
    set_anno_author(penult, "stacutil")

    retarget(doc, id_before, penult)
Esempio n. 3
0
def _actually_merge(tcache, edus, doc):
    """
    Given a timestamp cache, a document and a collection of edus,
    replace the edus with a single merged edu in the document

    Anything that points to one of the EDUs should point
    instead to the new edu.

    Anything which points exclusively to EDUs in the span
    should be deleted (or signaled?)

    Annotations and features should be merged
    """

    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        strs = [x for x in strs if x is not None]
        return list(strs)[0] if len(strs) == 1\
            else _MERGE_PREFIX + "/".join(strs)

    if not edus:
        return
    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(frozenset(x.type for x in edus))
        # feature keys for all edus
        all_keys = frozenset(x for edu in edus for x in edu.features.keys())
        for key in all_keys:
            old_values = frozenset(x.features.get(key) for x in edus)
            new_edu.features[key] = one_or_join(old_values)

    # in-place replacement
    for i, _ in enumerate(doc.units):
        if doc.units[i] in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        retarget(doc, edu.local_id(), new_edu)
Esempio n. 4
0
def _actually_merge(tcache, edus, doc):
    """
    Given a timestamp cache, a document and a collection of edus,
    replace the edus with a single merged edu in the document

    Anything that points to one of the EDUs should point
    instead to the new edu.

    Anything which points exclusively to EDUs in the span
    should be deleted (or signaled?)

    Annotations and features should be merged
    """
    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        strs = [x for x in strs if x is not None]
        return list(strs)[0] if len(strs) == 1\
            else _MERGE_PREFIX + "/".join(strs)

    if not edus:
        return
    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(frozenset(x.type for x in edus))
        # feature keys for all edus
        all_keys = frozenset(x for edu in edus for x in edu.features.keys())
        for key in all_keys:
            old_values = frozenset(x.features.get(key) for x in edus)
            new_edu.features[key] = one_or_join(old_values)

    # in-place replacement
    for i, _ in enumerate(doc.units):
        if doc.units[i] in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        retarget(doc, edu.local_id(), new_edu)
Esempio n. 5
0
def _actually_split(tcache, doc, spans, edu):
    """
    Replace `edu` in `doc` by one new EDU for each span in `spans`.

    Every new EDU is a deep copy of `edu` restricted to one of the
    spans.  It is authored by `_AUTHOR` and dated with the stamp that
    `tcache.get` returns for its span; its id is built from that same
    (author, stamp) pair, the intent being that the same new EDU gets
    the same id across all sections.  In the 'units' stage, the type
    and every feature value of the copies are marked with
    `_SPLIT_PREFIX`.

    Discourse stage: a 'Complex_discourse_unit' schema encompassing the
    new EDUs is built, and `retarget` is asked to replace references to
    the old EDU with it.  The schema is only added to `doc.schemas`
    when `retarget` returns something truthy (presumably meaning that
    the old EDU was in some relation or CDU -- to be confirmed against
    `retarget`).

    Nothing is returned; `doc` is modified in place.
    """
    pieces = {}
    for piece_span in sorted(spans):
        piece_stamp = tcache.get(piece_span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, piece_stamp))
        set_anno_date(piece, piece_stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            # only existing keys are reassigned, so iterating is safe
            for key, value in piece.features.items():
                piece.features[key] = _SPLIT_PREFIX + value
        piece.span = piece_span
        pieces[piece_id] = piece
        doc.units.append(piece)

    # the CDU is stamped for the span covering all of the pieces
    whole_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, whole_stamp))
    cdu_metadata = {
        'author': _AUTHOR,
        'creation-date': str(whole_stamp),
    }
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),  # member ids
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    referenced = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if referenced:
        doc.schemas.append(cdu)
Esempio n. 6
0
def _actually_split(tcache, doc, spans, edu):
    """
    Replace `edu` in `doc` by one new EDU for each span in `spans`.

    Every new EDU is a deep copy of `edu` restricted to one of the
    spans.  It is authored by `_AUTHOR` and dated with the stamp that
    `tcache.get` returns for its span; its id is built from that same
    (author, stamp) pair, the intent being that the same new EDU gets
    the same id across all sections.  In the 'units' stage, the type
    and every feature value of the copies are marked with
    `_SPLIT_PREFIX`.

    Discourse stage: a 'Complex_discourse_unit' schema encompassing the
    new EDUs is built, and `retarget` is asked to replace references to
    the old EDU with it.  The schema is only added to `doc.schemas`
    when `retarget` returns something truthy (presumably meaning that
    the old EDU was in some relation or CDU -- to be confirmed against
    `retarget`).

    Nothing is returned; `doc` is modified in place.
    """
    pieces = {}
    for piece_span in sorted(spans):
        piece_stamp = tcache.get(piece_span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, piece_stamp))
        set_anno_date(piece, piece_stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            # only existing keys are reassigned, so iterating is safe
            for key, value in piece.features.items():
                piece.features[key] = _SPLIT_PREFIX + value
        piece.span = piece_span
        pieces[piece_id] = piece
        doc.units.append(piece)

    # the CDU is stamped for the span covering all of the pieces
    whole_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, whole_stamp))
    cdu_metadata = {
        'author': _AUTHOR,
        'creation-date': str(whole_stamp),
    }
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),  # member ids
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    referenced = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if referenced:
        doc.schemas.append(cdu)