Example #1
0
 def test_cdu_itself_cross(self):
     "naughty: CDU spanning dialogues"
     # a CDU whose members sit in two different dialogues
     naughty = FakeCDU('c1', [self.edu1_1, self.edu2_1])
     document = FakeDocument(self.edus1, [], [naughty])
     crosses = is_cross_dialogue(Context.for_edus(document))
     self.assertTrue(crosses(document.copies[naughty]))
Example #2
0
 def assertPunctured(self, doc, rel):
     "check that the given relation punctures this graph"
     graph = self.mk_graph(doc)
     ctxs = Context.for_edus(doc)
     rel_node = graph_ids(graph)[rel.local_id()]
     self.assertTrue(is_puncture(graph, ctxs, rel_node),
                     'failed to detect puncture')
Example #3
0
 def assertIntact(self, doc, rel):
     "check that the given relation is not a puncture in this graph"
     graph = self.mk_graph(doc)
     ctxs = Context.for_edus(doc)
     rel_node = graph_ids(graph)[rel.local_id()]
     self.assertFalse(is_puncture(graph, ctxs, rel_node),
                      'unexpected puncture')
Example #4
0
 def test_cdu_itself_cross(self):
     "naughty: CDU spanning dialogues"
     spanning_cdu = FakeCDU('c1', [self.edu1_1, self.edu2_1])
     doc = FakeDocument(self.edus1, [], [spanning_cdu])
     checker = is_cross_dialogue(Context.for_edus(doc))
     # the copied CDU itself should be flagged as cross-dialogue
     self.assertTrue(checker(doc.copies[spanning_cdu]))
Example #5
0
 def assertPunctured(self, doc, rel):
     "the given relation should register as a puncture in this graph"
     graph = self.mk_graph(doc)
     contexts = Context.for_edus(doc)
     ids = graph_ids(graph)
     punctures = is_puncture(graph, contexts, ids[rel.local_id()])
     self.assertTrue(punctures, 'failed to detect puncture')
Example #6
0
 def assertIntact(self, doc, rel):
     "the given relation should not register as a puncture"
     graph = self.mk_graph(doc)
     contexts = Context.for_edus(doc)
     ids = graph_ids(graph)
     punctures = is_puncture(graph, contexts, ids[rel.local_id()])
     self.assertFalse(punctures, 'unexpected puncture')
Example #7
0
    def test_simple_segment_cross(self):
        "simple cross-dialogue"
        # a plain relation whose endpoints live in different dialogues
        rel = FakeRelInst('r', self.edu1_1, self.edu2_1)
        doc = FakeDocument(self.edus1, [rel], [])
        crosses = is_cross_dialogue(Context.for_edus(doc))
        self.assertTrue(crosses(doc.copies[rel]))
Example #8
0
    def test_simple_segment_cross(self):
        "simple cross-dialogue"
        source = self.edu1_1
        target = self.edu2_1
        rel = FakeRelInst('r', source, target)
        doc = FakeDocument(self.edus1, [rel], [])
        copies = doc.copies
        contexts = Context.for_edus(doc)
        # the copied relation should be flagged as cross-dialogue
        self.assertTrue(is_cross_dialogue(contexts)(copies[rel]))
Example #9
0
    def test_cdu_cross(self):
        "naughty: rel to CDU in another dialogue"
        cdu = FakeCDU('c1', [self.edu1_1, self.edu1_2])
        # relation whose target sits in a different dialogue from the CDU
        rel = FakeRelInst('r', cdu, self.edu2_1)
        doc = FakeDocument(self.edus1, [rel], [cdu])
        crosses = is_cross_dialogue(Context.for_edus(doc))
        self.assertTrue(crosses(doc.copies[rel]))
Example #10
0
    def test_cdu_cross(self):
        "naughty: rel to CDU in another dialogue"
        whole = FakeCDU('c1', [self.edu1_1, self.edu1_2])
        rel = FakeRelInst('r', whole, self.edu2_1)
        doc = FakeDocument(self.edus1, [rel], [whole])
        copies = doc.copies
        contexts = Context.for_edus(doc)
        self.assertTrue(is_cross_dialogue(contexts)(copies[rel]))
Example #11
0
    def test_innocent_cdu(self):
        "innocent: CDU entirely in dialogue"
        cdu = FakeCDU('c1', [self.edu1_1, self.edu1_2])
        rel = FakeRelInst('r', cdu, self.edu1_3)
        doc = FakeDocument(self.edus1, [rel], [cdu])
        copies = doc.copies
        crosses = is_cross_dialogue(Context.for_edus(doc))
        # neither the relation nor the CDU itself should be flagged
        self.assertFalse(crosses(copies[rel]))
        self.assertFalse(crosses(copies[cdu]))
Example #12
0
    def test_innocent(self):
        "no squawking on in-dialogue relation"
        source = self.edu1_1
        target = self.edu1_2
        rel = FakeRelInst('r', source, target)
        doc = FakeDocument(self.edus1, [rel], [])
        copies = doc.copies
        # sanity: the copied annotations are still recognisable EDUs
        self.assertTrue(stac.is_edu(copies[source]))
        self.assertTrue(stac.is_edu(copies[rel].source))
        contexts = Context.for_edus(doc)
        self.assertFalse(is_cross_dialogue(contexts)(copies[rel]))
Example #13
0
    def test_innocent_cdu(self):
        "innocent: CDU entirely in dialogue"
        cdu = FakeCDU('c1', [self.edu1_1, self.edu1_2])
        rel = FakeRelInst('r', cdu, self.edu1_3)
        doc = FakeDocument(self.edus1, [rel], [cdu])
        contexts = Context.for_edus(doc)
        copies = doc.copies
        # both the relation and the CDU stay inside one dialogue
        for anno in [rel, cdu]:
            self.assertFalse(is_cross_dialogue(contexts)(copies[anno]))
Example #14
0
    def test_innocent(self):
        "no squawking on in-dialogue relation"
        rel = FakeRelInst('r', self.edu1_1, self.edu1_2)
        doc = FakeDocument(self.edus1, [rel], [])
        copies = doc.copies
        # sanity checks: the copies are still recognisable EDUs
        self.assertTrue(stac.is_edu(copies[self.edu1_1]))
        self.assertTrue(stac.is_edu(copies[rel].source))
        contexts = Context.for_edus(doc)
        self.assertFalse(is_cross_dialogue(contexts)(copies[rel]))
Example #15
0
def get_edus_plus(inputs):
    """Generate (environment, contexts, edu) triples, one per EDU.

    For every document in the 'unannotated' stage, yields:

    * environment
    * contexts
    * edu
    """
    for env in stac_features.mk_envs(inputs, 'unannotated'):
        doc = env.current.doc
        ctxs = Context.for_edus(doc)
        edus = (unit for unit in doc.units if educe.stac.is_edu(unit))
        for edu in edus:
            yield env, ctxs, edu
Example #16
0
File: main.py Project: kowey/educe
 def __init_read_corpus(self, is_interesting, corpus_dir):
     """
     Read the corpus specified in our args.

     Sets `self.anno_files`, `self.corpus` and `self.contexts`.
     """
     reader = stac.Reader(corpus_dir)
     all_files = reader.files()
     self.anno_files = reader.filter(all_files, is_interesting)
     # snapshot the keys before the loop below: we insert into
     # self.anno_files while looping, and on Python 3 dict.keys()
     # is a live view, so iterating it during insertion raises
     # RuntimeError (on Python 2 keys() returned a list, which is
     # why this used to get away without the list())
     interesting = list(self.anno_files)
     for key in interesting:
         # also pull in the 'unannotated' twin of each interesting key
         ukey = twin_key(key, 'unannotated')
         if ukey in all_files:
             self.anno_files[ukey] = all_files[ukey]
     self.corpus = reader.slurp(self.anno_files, verbose=True)
     self.contexts = {k: Context.for_edus(self.corpus[k])
                      for k in self.corpus}
Example #17
0
def get_edus_plus(inputs):
    """Generate edus plus extra environmental information for each.

    Yields triples of:

    * environment
    * contexts
    * edu
    """
    envs = stac_features.mk_envs(inputs, 'unannotated')
    for env in envs:
        doc = env.current.doc
        contexts = Context.for_edus(doc)
        for unit in doc.units:
            if not educe.stac.is_edu(unit):
                continue
            yield env, contexts, unit
Example #18
0
 def __init_read_corpus(self, is_interesting, corpus_dir):
     """
     Read the corpus specified in our args
     """
     reader = stac.Reader(corpus_dir)
     all_files = reader.files()
     self.anno_files = reader.filter(all_files, is_interesting)
     # snapshot the keys first, since we add entries to the dict below;
     # also pair each interesting key with its 'unannotated' twin
     for key in list(self.anno_files):
         ukey = twin_key(key, 'unannotated')
         if ukey in all_files:
             self.anno_files[ukey] = all_files[ukey]
     self.corpus = reader.slurp(self.anno_files, verbose=True)
     self.contexts = {k: Context.for_edus(doc)
                      for k, doc in self.corpus.items()}
Example #19
0
    def _last_nodes(self):
        """
        Return the dict of node names to the set of last elements up to
        that node (included)
        """
        contexts = Context.for_edus(self._graph.doc)
        doc_speakers = frozenset(ctx.speaker()
                                 for ctx in contexts.values())

        latest = {}  # speaker -> most recently seen node for that speaker
        result = {}
        for node in self._nodes:
            anno = self._graph.annotation(node)
            for spkr in speakers(contexts, anno):
                latest[spkr] = node

            # the "last" set at this point: one node per speaker seen so far
            result[node] = frozenset(latest[spkr]
                                     for spkr in doc_speakers
                                     if spkr in latest)

        return result
Example #20
0
    def _last_nodes(self):
        """
        Return the dict of node names to the set of last elements up to
        that node (included)
        """
        graph = self._graph
        contexts = Context.for_edus(graph.doc)
        all_speakers = frozenset(ctx.speaker()
                                 for ctx in contexts.values())

        seen_last = dict()  # speaker -> latest node attributed to them
        result = dict()
        for node in self._nodes:
            annotation = graph.annotation(node)
            for speaker in speakers(contexts, annotation):
                seen_last[speaker] = node

            result[node] = frozenset(seen_last[speaker]
                                     for speaker in all_speakers
                                     if speaker in seen_last)

        return result
Example #21
0
def fuse_edus(discourse_doc, unit_doc, postags):
    """Return a copy of the discourse level doc, merging info from both
    the discourse and units stage.

    All EDUs will be converted to higher level EDUs.

    Notes
    -----
    * The discourse stage is primary in that we work by going over what
      EDUs we find in the discourse stage and trying to enhance them
      with information we find on their units-level equivalents.
      Sometimes (rarely but it happens) annotations can go out of synch.
      EDUs missing on the units stage will be silently ignored (we try
      to make do without them).
      EDUs that were introduced on the units stage but not percolated to
      discourse will also be ignored.

    * We rely on annotation ids to match EDUs from both stages; it's up
      to you to ensure that the annotations are really in synch.

    * This does not constitute a full merge of the documents. For a full
      merge, you would have to bring over other annotations such as
      Resources, `Preference`, `Anaphor`, `Several_resources`, taking
      care all the while to ensure there are no timestamp clashes with
      pre-existing annotations (it's unlikely but best be on the safe
      side if you ever find yourself with automatically generated
      annotations, where all bets are off time-stamp wise).

    Parameters
    ----------
    discourse_doc : GlozzDocument
        Document from the "discourse" stage.
    unit_doc : GlozzDocument
        Document from the "units" stage.
    postags : list of Token
        Sequence of educe tokens predicted by the POS tagger for this
        document.

    Returns
    -------
    doc : GlozzDocument
        Deep copy of the discourse_doc with info from the units stage
        merged in.
    """
    # work on a deep copy so the caller's discourse_doc is left untouched
    doc = copy.deepcopy(discourse_doc)

    # first pass: create the EDU objects
    annos = sorted([x for x in doc.units if is_edu(x)],
                   key=lambda x: x.span)
    replacements = {}  # original annotation -> higher-level EDU
    for anno in annos:
        # a missing units-stage twin is tolerated (see Notes above)
        unit_anno = None if unit_doc is None else twin_from(unit_doc, anno)
        edu = EDU(doc, anno, unit_anno)
        replacements[anno] = edu

    # second pass: rewrite doc so that annotations that correspond
    # to EDUs are replaced by their higher-level equivalents
    # (in units, as relation endpoints, and as schema members)
    edus = []
    for anno in annos:
        edu = replacements[anno]
        edus.append(edu)
        doc.units.remove(anno)
        doc.units.append(edu)
        for rel in doc.relations:
            if rel.source == anno:
                rel.source = edu
            if rel.target == anno:
                rel.target = edu
        for schema in doc.schemas:
            if anno in schema.units:
                schema.units.remove(anno)
                schema.units.append(edu)

    # final pass: flesh out the EDUs with contextual info
    # now the EDUs should work as contexts too
    contexts = Context.for_edus(doc, postags=postags)
    for edu in edus:
        edu.fleshout(contexts[edu])
    return doc
Example #22
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Parameters
    ----------
    corpus : dict
        Mapping from file id to annotated document.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
    """
    rows = {
        anno_type: list()
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    def _merge(base, extra):
        """Combine the shared columns with annotation-specific features.

        `dict(base.items() + extra.items())` only works on Python 2,
        where items() returns lists; on Python 3 items() is a view and
        '+' raises TypeError.  Copy-and-update behaves identically on
        both.
        """
        row = dict(base)
        row.update(extra)
        return row

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                rows['edu'].append(
                    _merge(common_cols, edu_feats(doc, ctx, anno)))
            elif is_cdu(anno):
                rows['cdu'].append(_merge(common_cols, cdu_feats(anno)))
            elif is_relation_instance(anno):
                rows['rel'].append(
                    _merge(common_cols, rel_feats(doc, ctx, anno)))
            elif is_dialogue(anno):
                rows['dialogue'].append(_merge(common_cols, dlg_feats(anno)))
            elif is_turn(anno):
                rows['turn'].append(_merge(common_cols, turn_feats(anno)))
            elif is_turn_star(anno):
                rows['tstar'].append(_merge(common_cols, tstar_feats(anno)))
            elif anno.type in [
                    'paragraph', 'Resource', 'Anaphora', 'Several_resources',
                    'Preference'
            ]:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items() if row_list
    }

    return res
Example #23
0
def fuse_edus(discourse_doc, unit_doc, postags):
    """Return a copy of the discourse level doc, merging info
    from both the discourse and units stage.

    All EDUs will be converted to higher level EDUs.

    Notes
    -----
    * The discourse stage is primary in that we work by going over what EDUs
      we find in the discourse stage and trying to enhance them with
      information we find on their units-level equivalents. Sometimes (rarely
      but it happens) annotations can go out of synch.  EDUs missing on the
      units stage will be silently ignored (we try to make do without them).
      EDUs that were introduced on the units stage but not percolated to
      discourse will also be ignored.

    * We rely on annotation ids to match EDUs from both stages; it's up to you
      to ensure that the annotations are really in synch.

    * This does not constitute a full merge of the documents. For a full merge,
      you would have to bring over other annotations such as Resources,
      `Preference`, `Anaphor`, `Several_resources`, taking care all the while
      to ensure there are no timestamp clashes with pre-existing annotations
      (it's unlikely but best be on the safe side if you ever find yourself
      with automatically generated annotations, where all bets are off
      time-stamp wise).

    Parameters
    ----------
    discourse_doc : GlozzDocument
        Document from the "discourse" stage.
    unit_doc : GlozzDocument
        Document from the "units" stage.
    postags : list of Token
        POS tagger output for this document.

    Returns
    -------
    doc : GlozzDocument
        Deep copy of discourse_doc with units-stage info merged in.
    """
    # work on a deep copy so the caller's discourse_doc is left untouched
    doc = copy.deepcopy(discourse_doc)

    # first pass: create the EDU objects
    annos = sorted([x for x in doc.units if is_edu(x)], key=lambda x: x.span)
    replacements = {}  # original annotation -> higher-level EDU
    for anno in annos:
        # a missing units-stage twin is tolerated (see Notes above)
        unit_anno = None if unit_doc is None else twin_from(unit_doc, anno)
        edu = EDU(doc, anno, unit_anno)
        replacements[anno] = edu

    # second pass: rewrite doc so that annotations that correspond
    # to EDUs are replaced by their higher-level equivalents
    # (in units, as relation endpoints, and as schema members)
    edus = []
    for anno in annos:
        edu = replacements[anno]
        edus.append(edu)
        doc.units.remove(anno)
        doc.units.append(edu)
        for rel in doc.relations:
            if rel.source == anno:
                rel.source = edu
            if rel.target == anno:
                rel.target = edu
        for schema in doc.schemas:
            if anno in schema.units:
                schema.units.remove(anno)
                schema.units.append(edu)

    # final pass: flesh out the EDUs with contextual info
    # now the EDUs should work as contexts too
    contexts = Context.for_edus(doc, postags=postags)
    for edu in edus:
        edu.fleshout(contexts[edu])
    return doc
Example #24
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
    """
    rows = {anno_type: list()
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}

    def _row(common_cols, feats):
        "shared columns plus annotation-specific features, as one dict"
        # NB: dict(a.items() + b.items()) is Python-2-only; on Python 3
        # items() returns a view, which does not support '+' and raises
        # TypeError.  Copy-and-update is equivalent on both versions.
        merged = dict(common_cols)
        merged.update(feats)
        return merged

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                rows['edu'].append(
                    _row(common_cols, edu_feats(doc, ctx, anno)))
            elif is_cdu(anno):
                rows['cdu'].append(
                    _row(common_cols, cdu_feats(anno)))
            elif is_relation_instance(anno):
                rows['rel'].append(
                    _row(common_cols, rel_feats(doc, ctx, anno)))
            elif is_dialogue(anno):
                rows['dialogue'].append(
                    _row(common_cols, dlg_feats(anno)))
            elif is_turn(anno):
                rows['turn'].append(
                    _row(common_cols, turn_feats(anno)))
            elif is_turn_star(anno):
                rows['tstar'].append(
                    _row(common_cols, tstar_feats(anno)))
            elif anno.type in ['paragraph',
                               'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}

    return res