コード例 #1
0
def turns_with_final_emoticons(doc, tags):
    """
    Return a tuple of two lists of turns ending with the pattern
    EDU + emoticon-only-EDU.

    The first (main) list contains those that are not pointed to by any
    relations or schema. The second (warnings only) list contains those
    that have relations or schema pointing to them.

    The reason we distinguish between the two lists is that we don't
    want to touch those in the latter (out of conservatism, the idea
    of removing these from their relations, CDUs seems scary), but we
    want to know about them.

    Parameters
    ----------
    doc
        document whose turns are inspected
    tags
        tags used to build the enclosure graph

    Returns
    -------
    tuple of (list, list)
        (free turns, linked turns)
    """
    egraph = EnclosureGraph(doc, tags)
    affected_free_turns = []
    affected_linked_turns = []

    for turn in sorted_turns(doc):
        edus = sorted_first_widest(egraph.inside(turn))

        # FIX: check the length *before* indexing; the original read
        # edus[-1] first and would raise IndexError on a turn whose
        # enclosure yields no EDUs
        if len(edus) < 2:
            continue
        last_edu = edus[-1]
        if is_just_emoticon(egraph.inside(last_edu)):
            if has_links(doc, last_edu):
                affected_linked_turns.append(turn)
            else:
                affected_free_turns.append(turn)

    return affected_free_turns, affected_linked_turns
コード例 #2
0
def annotate(txt, annotations, inserts=None):
    """
    Decorate a text with arbitrary bracket symbols, as a visual
    guide to the annotations on that text. For example, in a
    chat corpus, you might use newlines to indicate turn
    boundaries and square brackets for segments.

    Parameters
    ----------
    txt : str
        the text to decorate
    annotations
        annotations whose text spans should be bracketed
    inserts : dict, optional
        dictionary from annotation (rough) type to pair of
        its opening/closing bracket; defaults to DEFAULT_INSERTS

    Returns
    -------
    str
        `txt` with bracket symbols inserted around annotation spans

    FIXME: this needs to become a standard educe utility,
    maybe as part of the educe.annotation layer?
    """
    inserts = inserts or DEFAULT_INSERTS
    if not annotations:
        return txt

    def is_visible(anno):
        """
        Is this annotation one we intend to display?
        """
        return rough_type(anno) in inserts

    def add_endpoints(endpoints, buf, pos):
        """
        Insert any pending closing annotations (eg. right bracket)
        into the text
        """
        endpoints2 = []
        buf2 = buf
        for pos2, rparen in endpoints:
            if pos == pos2:
                buf2 = buf2 + rparen
            else:
                endpoints2.append((pos2, rparen))
        return endpoints2, buf2

    s_annos = sorted_first_widest(filter(is_visible, annotations))
    endpoints = []
    buf = ""
    # walk the text character by character: first close any brackets
    # pending at this position, then open brackets for annotations
    # starting here, then emit the character itself
    for i, char in enumerate(txt):  # idiomatic: was range(0, len(txt))
        endpoints, buf = add_endpoints(endpoints, buf, i)
        while s_annos:
            nxt = s_annos[0]
            span = nxt.text_span()
            if span.char_start == i:
                lparen, rparen = inserts[rough_type(nxt)]
                buf = buf + lparen
                endpoints.insert(0, (span.char_end, rparen))  # lifo
                del s_annos[0]
            else:
                break
        buf = buf + char

    # flush closing brackets for spans ending at end-of-text
    _, buf = add_endpoints(endpoints, buf, len(txt))
    return buf
コード例 #3
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself
    if you're using `config_argparser`

    Parameters
    ----------
    args
        parsed command-line arguments; must carry an `edges` flag
        plus whatever `read_corpus` expects
    """
    corpus = read_corpus(args, verbose=True)
    for k in sorted(corpus, key=educe.stac.id_to_path):
        doc = corpus[k]
        print("========== %s ============" % k)
        print()
        if args.edges:
            # show only the first and last dialogues of the document
            dialogues = sorted_first_widest(filter(is_dialogue, doc.units))
            if dialogues:
                d_first = dialogues[0]
                # FIX: encode like the sibling print calls below; the
                # original printed this one un-encoded, inconsistently
                print(annotate_doc(doc,
                                   span=d_first.text_span()).encode('utf-8'))
                if len(dialogues) > 1:
                    d_last = dialogues[-1]
                    txt = annotate_doc(doc, span=d_last.text_span())
                    print("...\n")
                    print(txt.encode('utf-8'))
        else:
            print(annotate_doc(doc).encode('utf-8'))
        print()
コード例 #4
0
ファイル: clean_emoticons.py プロジェクト: kowey/educe
def turns_with_final_emoticons(doc, tags):
    """
    Return a tuple of two lists of turns ending with the pattern
    EDU + emoticon-only-EDU.

    The first (main) list contains those that are not pointed to by any
    relations or schema. The second (warnings only) list contains those
    that have relations or schema pointing to them.

    The reason we distinguish between the two lists is that we don't
    want to touch those in the latter (out of conservatism, the idea
    of removing these from their relations, CDUs seems scary), but we
    want to know about them.

    Parameters
    ----------
    doc
        document whose turns are inspected
    tags
        tags used to build the enclosure graph

    Returns
    -------
    tuple of (list, list)
        (free turns, linked turns)
    """
    egraph = EnclosureGraph(doc, tags)
    affected_free_turns = []
    affected_linked_turns = []

    for turn in sorted_turns(doc):
        edus = sorted_first_widest(egraph.inside(turn))

        # FIX: check the length *before* indexing; the original read
        # edus[-1] first and would raise IndexError on a turn whose
        # enclosure yields no EDUs
        if len(edus) < 2:
            continue
        last_edu = edus[-1]
        if is_just_emoticon(egraph.inside(last_edu)):
            if has_links(doc, last_edu):
                affected_linked_turns.append(turn)
            else:
                affected_free_turns.append(turn)

    return affected_free_turns, affected_linked_turns
コード例 #5
0
ファイル: annotate.py プロジェクト: eipiplusun/educe
def annotate(txt, annotations, inserts=None):
    """
    Decorate a text with arbitrary bracket symbols, as a visual
    guide to the annotations on that text. For example, in a
    chat corpus, you might use newlines to indicate turn
    boundaries and square brackets for segments.

    Parameters
    ----------
    txt : str
        the text to decorate
    annotations
        annotations whose text spans should be bracketed
    inserts : dict, optional
        dictionary from annotation (rough) type to pair of
        its opening/closing bracket; defaults to DEFAULT_INSERTS

    Returns
    -------
    str
        `txt` with bracket symbols inserted around annotation spans

    FIXME: this needs to become a standard educe utility,
    maybe as part of the educe.annotation layer?
    """
    inserts = inserts or DEFAULT_INSERTS
    if not annotations:
        return txt

    def is_visible(anno):
        """
        Is this annotation one we intend to display?
        """
        return rough_type(anno) in inserts

    def add_endpoints(endpoints, buf, pos):
        """
        Insert any pending closing annotations (eg. right bracket)
        into the text
        """
        endpoints2 = []
        buf2 = buf
        for pos2, rparen in endpoints:
            if pos == pos2:
                buf2 = buf2 + rparen
            else:
                endpoints2.append((pos2, rparen))
        return endpoints2, buf2

    s_annos = sorted_first_widest(filter(is_visible, annotations))
    endpoints = []
    buf = ""
    # walk the text character by character: first close any brackets
    # pending at this position, then open brackets for annotations
    # starting here, then emit the character itself
    for i, char in enumerate(txt):  # idiomatic: was range(0, len(txt))
        endpoints, buf = add_endpoints(endpoints, buf, i)
        while s_annos:
            nxt = s_annos[0]
            span = nxt.text_span()
            if span.char_start == i:
                lparen, rparen = inserts[rough_type(nxt)]
                buf = buf + lparen
                endpoints.insert(0, (span.char_end, rparen))  # lifo
                del s_annos[0]
            else:
                break
        buf = buf + char

    # flush closing brackets for spans ending at end-of-text
    _, buf = add_endpoints(endpoints, buf, len(txt))
    return buf
コード例 #6
0
ファイル: text.py プロジェクト: eipiplusun/educe
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself
    if you're using `config_argparser`

    Parameters
    ----------
    args
        parsed command-line arguments; must carry an `edges` flag
        plus whatever `read_corpus` expects
    """
    corpus = read_corpus(args, verbose=True)
    for k in sorted(corpus, key=educe.stac.id_to_path):
        doc = corpus[k]
        print("========== %s ============" % k)
        print()
        if args.edges:
            # show only the first and last dialogues of the document
            dialogues = sorted_first_widest(filter(is_dialogue, doc.units))
            if dialogues:
                d_first = dialogues[0]
                # FIX: encode like the sibling print calls below; the
                # original printed this one un-encoded, inconsistently
                print(annotate_doc(doc,
                                   span=d_first.text_span()).encode('utf-8'))
                if len(dialogues) > 1:
                    d_last = dialogues[-1]
                    txt = annotate_doc(doc, span=d_last.text_span())
                    print("...\n")
                    print(txt.encode('utf-8'))
        else:
            print(annotate_doc(doc).encode('utf-8'))
        print()
コード例 #7
0
ファイル: rewrite.py プロジェクト: irit-melodi/educe
def _diff_friendly(annos):
    """
    Copy of `annos`, tweaked and placed in an arbitrary canonical
    order.

    Rationale: you might want a visual diff between the human-authored
    annotations and the automated modifications you requested; for that
    to work we must eliminate spurious diffs (ordering, volatile
    metadata) that would obscure the interesting bits.
    """
    return sorted_first_widest(map(_sans_modified_by, annos))
コード例 #8
0
ファイル: rewrite.py プロジェクト: Sablayrolles/debates
def _diff_friendly(annos):
    """
    Copy of `annos`, tweaked and placed in an arbitrary canonical
    order.

    Rationale: you might want a visual diff between the human-authored
    annotations and the automated modifications you requested; for that
    to work we must eliminate spurious diffs (ordering, volatile
    metadata) that would obscure the interesting bits.
    """
    return sorted_first_widest(map(_sans_modified_by, annos))
コード例 #9
0
ファイル: graph.py プロジェクト: eipiplusun/educe
def search_graph_edus(inputs, k, gra, pred):
    """
    Return a ReportItem for every EDU in the graph for which the
    given predicate holds.
    """
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]
    # map each EDU annotation back to its graph-internal name
    name_of = {gra.annotation(name): name for name in gra.edus()}
    matches = []
    for edu in sorted_first_widest(name_of.keys()):
        if pred(gra, contexts, name_of[edu]):
            matches.append(UnitItem(doc, contexts, edu))
    return matches
コード例 #10
0
ファイル: annotate.py プロジェクト: eipiplusun/educe
def schema_text(doc, anno):
    """
    (recursive) text preview of a schema and its contents.
    Members are enclosed in square brackets.
    """
    if anno is None:
        return ""
    if not isinstance(anno, Schema):
        return doc.text(anno.text_span())
    # recurse into each member, bracketing its preview
    previews = (u"[{}]".format(schema_text(doc, member))
                for member in sorted_first_widest(anno.members))
    return "...".join(previews)
コード例 #11
0
def search_graph_edus(inputs, k, gra, pred):
    """
    Return a ReportItem for every EDU in the graph for which the
    given predicate holds.
    """
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]
    # map each EDU annotation back to its graph-internal name
    name_of = {gra.annotation(name): name for name in gra.edus()}
    matches = []
    for edu in sorted_first_widest(name_of.keys()):
        if pred(gra, contexts, name_of[edu]):
            matches.append(UnitItem(doc, contexts, edu))
    return matches
コード例 #12
0
def schema_text(doc, anno):
    """
    (recursive) text preview of a schema and its contents.
    Members are enclosed in square brackets.
    """
    if anno is None:
        return ""
    if not isinstance(anno, Schema):
        return doc.text(anno.text_span())
    # recurse into each member, bracketing its preview
    previews = (u"[{}]".format(schema_text(doc, member))
                for member in sorted_first_widest(anno.members))
    return "...".join(previews)
コード例 #13
0
ファイル: glozz.py プロジェクト: kowey/educe
def cross_check_against(inputs, key1, stage='unannotated'):
    """
    Compare annotations with their equivalents on a twin document
    in the corpus.

    Returns
    -------
    tuple
        (missing/excess items in first-widest order, id mismatches);
        on MissingDocumentException, ({}, {}) is returned instead —
        NOTE(review): the first element is then a dict rather than a
        list; callers should only rely on both being falsy
    """
    key2 = twin_key(key1, stage)
    try:
        missing = cross_check_units(inputs, key2, key1,
                                    MissingItem.missing_status)
        excess = cross_check_units(inputs, key1, key2,
                                   MissingItem.excess_status)
        mismatches = check_unit_ids(inputs, key1, key2)
        missing_excess = []
        for vals in missing.values():
            missing_excess.extend(vals)
        for vals in excess.values():
            missing_excess.extend(vals)

        return sorted_first_widest(missing_excess), mismatches
    except MissingDocumentException as oops:
        # BUG FIX: the original passed sys.stderr as a positional
        # argument to print (so its repr was printed to stdout);
        # use the `file` keyword to actually write to stderr
        print("ARGH! Can't cross-check ", oops.k, file=sys.stderr)
        return ({}, {})
コード例 #14
0
ファイル: glozz.py プロジェクト: fbuijs/educe
def cross_check_against(inputs, key1, stage='unannotated'):
    """
    Compare annotations with their equivalents on a twin document
    in the corpus.

    Returns
    -------
    tuple
        (missing/excess items in first-widest order, id mismatches);
        on MissingDocumentException, ({}, {}) is returned instead —
        NOTE(review): the first element is then a dict rather than a
        list; callers should only rely on both being falsy
    """
    key2 = twin_key(key1, stage)
    try:
        missing = cross_check_units(inputs, key2, key1,
                                    MissingItem.missing_status)
        excess = cross_check_units(inputs, key1, key2,
                                   MissingItem.excess_status)
        mismatches = check_unit_ids(inputs, key1, key2)
        missing_excess = []
        for vals in missing.values():
            missing_excess.extend(vals)
        for vals in excess.values():
            missing_excess.extend(vals)

        return sorted_first_widest(missing_excess), mismatches
    except MissingDocumentException as oops:
        # BUG FIX: the original passed sys.stderr as a positional
        # argument to print (so its repr was printed to stdout);
        # use the `file` keyword to actually write to stderr
        print("ARGH! Can't cross-check ", oops.k, file=sys.stderr)
        return ({}, {})
コード例 #15
0
ファイル: clean_emoticons.py プロジェクト: kowey/educe
def merge_final_emoticons(tcache, turn_spans, doc, tags):
    """
    Absorb final emoticon EDUs into the EDU that precedes them.

    Given a timestamp cache and some text spans identifying turns
    with final emoticons in them, and a document:

    1. find the specified turns in the document
    2. absorb their emoticon EDUs into the one before it

    This modifies the document in place and does not return anything.
    """
    egraph = EnclosureGraph(doc, tags)
    # only the turns whose span was explicitly selected by the caller
    selected = (t for t in sorted_turns(doc)
                if t.text_span() in turn_spans)
    for turn in selected:
        edus = sorted_first_widest(egraph.inside(turn))
        assert len(edus) > 1

        stamp = tcache.get(educe.stac.turn_id(turn))
        penult_edu, last_edu = edus[-2], edus[-1]
        absorb_emoticon(doc, stamp, penult_edu, last_edu)
        doc.units.remove(last_edu)
コード例 #16
0
def merge_final_emoticons(tcache, turn_spans, doc, tags):
    """
    Absorb final emoticon EDUs into the EDU that precedes them.

    Given a timestamp cache and some text spans identifying turns
    with final emoticons in them, and a document:

    1. find the specified turns in the document
    2. absorb their emoticon EDUs into the one before it

    This modifies the document in place and does not return anything.
    """
    egraph = EnclosureGraph(doc, tags)
    # only the turns whose span was explicitly selected by the caller
    selected = (t for t in sorted_turns(doc)
                if t.text_span() in turn_spans)
    for turn in selected:
        edus = sorted_first_widest(egraph.inside(turn))
        assert len(edus) > 1

        stamp = tcache.get(educe.stac.turn_id(turn))
        penult_edu, last_edu = edus[-2], edus[-1]
        absorb_emoticon(doc, stamp, penult_edu, last_edu)
        doc.units.remove(last_edu)
コード例 #17
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    families = defaultdict(list)
    # 2017-03-15 merge emoticon EDUs if *all* (selected?) 'discourse'
    # annotators agree that they are not part of any schema or relation
    discourse_subcorpus = defaultdict(list)
    for k in corpus:
        fam = (k.doc, k.subdoc)
        families[fam].append(k)
        if k.stage == 'discourse':
            discourse_subcorpus[fam].append(k)

    for fam in sorted(families):
        print(family_banner(fam[0], fam[1], families[fam]))
        disc_ks = discourse_subcorpus[fam]

        # BUG FIX: the original initialised `turns = set()` and then
        # intersected with `turns &= set(turns_k)`, so the result was
        # always empty and no turn could ever be merged; seed the
        # intersection with the first annotator's set instead
        turns = None
        warn_turns = set()
        for disc_k in disc_ks:
            doc = corpus[disc_k]
            turns_k, warn_turns_k = turns_with_final_emoticons(
                doc, postags[disc_k])
            turns = (set(turns_k) if turns is None
                     else turns & set(turns_k))
            warn_turns |= set(warn_turns_k)
        turns = sorted_first_widest(turns or [])
        warn_turns = sorted_first_widest(warn_turns)

        warnings = []
        if warn_turns:
            warnings.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            warnings.extend(" " + doc.text(x.text_span()) for x in warn_turns)
            warnings.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")

        if not turns:
            warnings.append("Skipping %s (and related); no offending emoticons"
                            % disc_k)

        print("\n".join(warnings))

        if not turns:
            continue

        turn_spans = [x.text_span() for x in turns]
        for k in families[fam]:
            doc = copy.deepcopy(corpus[k])
            tags = postags[k]
            merge_final_emoticons(tcache, turn_spans, doc, tags)
            # BUG FIX: discourse_subcorpus[fam] is a *list* of keys;
            # the original compared with `==`, which is never true for
            # a single key, so the diff below was never shown
            if k in discourse_subcorpus[fam]:
                for turn_span in turn_spans:
                    print(show_diff(corpus[k], doc, span=turn_span))
                    print()
            save_document(output_dir, k, doc)
        tcache.reset()
    announce_output_dir(output_dir)
コード例 #18
0
ファイル: clean_emoticons.py プロジェクト: kowey/educe
def sorted_turns(doc):
    """
    The turn annotations of a document, sorted by text span.
    """
    return sorted_first_widest(filter(educe.stac.is_turn, doc.units))
コード例 #19
0
ファイル: clean_emoticons.py プロジェクト: irit-melodi/educe
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    families = defaultdict(list)
    # 2017-03-15 merge emoticon EDUs if *all* (selected?) 'discourse'
    # annotators agree that they are not part of any schema or relation
    discourse_subcorpus = defaultdict(list)
    for k in corpus:
        fam = (k.doc, k.subdoc)
        families[fam].append(k)
        if k.stage == 'discourse':
            discourse_subcorpus[fam].append(k)

    for fam in sorted(families):
        print(family_banner(fam[0], fam[1], families[fam]))
        disc_ks = discourse_subcorpus[fam]

        # BUG FIX: the original initialised `turns = set()` and then
        # intersected with `turns &= set(turns_k)`, so the result was
        # always empty and no turn could ever be merged; seed the
        # intersection with the first annotator's set instead
        turns = None
        warn_turns = set()
        for disc_k in disc_ks:
            doc = corpus[disc_k]
            turns_k, warn_turns_k = turns_with_final_emoticons(
                doc, postags[disc_k])
            turns = (set(turns_k) if turns is None
                     else turns & set(turns_k))
            warn_turns |= set(warn_turns_k)
        turns = sorted_first_widest(turns or [])
        warn_turns = sorted_first_widest(warn_turns)

        warnings = []
        if warn_turns:
            warnings.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            warnings.extend(" " + doc.text(x.text_span()) for x in warn_turns)
            warnings.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")

        if not turns:
            warnings.append("Skipping %s (and related); no offending emoticons"
                            % disc_k)

        print("\n".join(warnings))

        if not turns:
            continue

        turn_spans = [x.text_span() for x in turns]
        for k in families[fam]:
            doc = copy.deepcopy(corpus[k])
            tags = postags[k]
            merge_final_emoticons(tcache, turn_spans, doc, tags)
            # BUG FIX: discourse_subcorpus[fam] is a *list* of keys;
            # the original compared with `==`, which is never true for
            # a single key, so the diff below was never shown
            if k in discourse_subcorpus[fam]:
                for turn_span in turn_spans:
                    print(show_diff(corpus[k], doc, span=turn_span))
                    print()
            save_document(output_dir, k, doc)
        tcache.reset()
    announce_output_dir(output_dir)
コード例 #20
0
def sorted_turns(doc):
    """
    The turn annotations of a document, sorted by text span.
    """
    return sorted_first_widest(filter(educe.stac.is_turn, doc.units))