Ejemplo n.º 1
0
    def add_conll_utterance(self,
                            parsed,
                            tokens,
                            corefs,
                            speaker_id,
                            use_gold_mentions,
                            debug=False):
        """Add one CoNLL utterance, its speaker and its mentions to the document.

        The ``"start"``/``"end"`` values of every entry in ``corefs`` are
        rewritten **in place**: they are looked up in the table returned by
        ``self.get_conll_spacy_lookup(tokens, parsed)`` and replaced by the
        first (for ``"start"``) and last (for ``"end"``) index found there.

        Args:
            parsed: The parsed utterance. It must support slicing, ``.text``
                and ``.sents`` (presumably a spaCy ``Doc`` -- confirm against
                the caller).
            tokens: The CoNLL tokens of the utterance; stored as-is and passed
                to ``self.get_conll_spacy_lookup``.
            corefs: Sequence of dicts with the keys ``"label"``, ``"start"``
                and ``"end"``. ``"start"`` and ``"end"`` are overwritten.
            speaker_id: Speaker identifier. An unknown speaker is created on
                the fly, using ``speaker_id.split("_")`` as its name.
            use_gold_mentions: If true, one ``Mention`` is created per gold
                coref. Otherwise mentions are extracted from ``parsed`` and a
                gold label is attached to each extracted mention whose span
                matches a gold coref exactly; gold corefs without any match
                are recorded in ``self.missed_gold``.
            debug: If true, print diagnostics to stdout.

        Raises:
            ValueError: If a coref has no value for ``"label"``, ``"start"``
                or ``"end"``. The check runs before anything is modified, so
                neither ``corefs`` nor this object is left half-updated.
        """
        required_keys = ("label", "start", "end")

        # Validate every coref up front. Doing this inside the conversion loop
        # would rewrite the leading corefs in place and only then fail on a
        # later one, leaving the caller with partially converted indexes.
        for coref in corefs:
            missing_values = [
                key for key in required_keys if coref.get(key) is None
            ]
            if missing_values:
                found_values = {
                    key: coref[key]
                    for key in required_keys
                    if coref.get(key) is not None
                }
                # ValueError is a subclass of Exception, so callers that
                # catch the previously raised bare Exception keep working.
                raise ValueError(
                    f"Coref {self.name} with fields {found_values} has empty values for the keys {missing_values}."
                )

        conll_lookup = self.get_conll_spacy_lookup(tokens, parsed)
        self.conll_tokens.append(tokens)
        self.conll_lookup.append(conll_lookup)

        # Convert conll tokens coref index in spacy tokens indexes
        for coref in corefs:
            coref["start"] = conll_lookup[coref["start"]][0]
            coref["end"] = conll_lookup[coref["end"]][-1]

        if speaker_id not in self.speakers:
            speaker_name = speaker_id.split("_")
            if debug:
                print("New speaker: ", speaker_id, "name: ", speaker_name)
            self.speakers[speaker_id] = Speaker(speaker_id, speaker_name)

        if use_gold_mentions:
            for coref in corefs:
                # coref["end"] is inclusive, hence the +1 for the slice.
                mention = Mention(
                    parsed[coref["start"]:coref["end"] + 1],
                    len(self.mentions),
                    len(self.utterances),
                    self.n_sents,
                    speaker=self.speakers[speaker_id],
                    gold_label=coref["label"],
                )
                self.mentions.append(mention)
        else:
            mentions_spans = extract_mentions_spans(doc=parsed,
                                                    blacklist=self.blacklist)
            self._process_mentions(
                mentions_spans,
                len(self.utterances),
                self.n_sents,
                self.speakers[speaker_id],
            )

            # Assign a gold label to mentions which have one
            if debug:
                print("Check corefs", corefs)
            # Only mentions of the utterance being added can match; select
            # them once instead of rescanning every mention for every coref.
            utterance_index = len(self.utterances)
            current_mentions = [
                m for m in self.mentions
                if m.utterance_index == utterance_index
            ]
            identified_gold = []
            for coref in corefs:
                found = False
                for m in current_mentions:
                    # m.end is compared as exclusive, coref["end"] as inclusive.
                    if coref["start"] == m.start and coref["end"] == m.end - 1:
                        m.gold_label = coref["label"]
                        found = True
                identified_gold.append(found)

            for found, coref in zip(identified_gold, corefs):
                if found:
                    continue
                gold_span = parsed[coref["start"]:coref["end"] + 1]
                self.missed_gold.append([
                    self.name,
                    self.part,
                    str(len(self.utterances)),
                    parsed.text,
                    gold_span.text,
                ])
                if debug:
                    print(
                        "❄️ gold mention not in predicted mentions",
                        coref,
                        gold_span,
                    )

        self.utterances.append(parsed)
        self.gold_corefs.append(corefs)
        self.utterances_speaker.append(self.speakers[speaker_id])
        self.n_sents += sum(1 for _ in parsed.sents)
Ejemplo n.º 2
0
    def add_conll_utterance(self,
                            parsed,
                            tokens,
                            corefs,
                            speaker_id,
                            use_gold_mentions,
                            debug=False):
        """Add one CoNLL utterance, its speaker and its mentions to the document.

        The ``'start'``/``'end'`` values of every entry in ``corefs`` are
        rewritten **in place**: they are looked up in the table returned by
        ``self.get_conll_spacy_lookup(tokens, parsed)`` and replaced by the
        first (for ``'start'``) and last (for ``'end'``) index found there.

        Args:
            parsed: The parsed utterance. It must support slicing, ``.text``
                and ``.sents`` (presumably a spaCy ``Doc`` -- confirm against
                the caller).
            tokens: The CoNLL tokens of the utterance; stored as-is and passed
                to ``self.get_conll_spacy_lookup``.
            corefs: Sequence of dicts with the keys ``'label'``, ``'start'``
                and ``'end'``. ``'start'`` and ``'end'`` are overwritten.
            speaker_id: Speaker identifier. An unknown speaker is created on
                the fly, using ``speaker_id.split('_')`` as its name.
            use_gold_mentions: If true, one ``Mention`` is created per gold
                coref. Otherwise mentions are extracted from ``parsed`` and a
                gold label is attached to each extracted mention whose span
                matches a gold coref exactly; gold corefs without any match
                are recorded in ``self.missed_gold``.
            debug: If true, print diagnostics to stdout.

        Raises:
            AssertionError: If a coref has ``None`` for ``'label'``,
                ``'start'`` or ``'end'``. The check runs before anything is
                modified, so neither ``corefs`` nor this object is left
                half-updated.
            KeyError: If a coref lacks one of those keys altogether.
        """
        # Validate every coref up front so that a bad entry cannot leave the
        # earlier ones already converted in place.
        for coref in corefs:
            if (coref['label'] is None or coref['start'] is None
                    or coref['end'] is None):
                # Raised explicitly rather than via `assert`: the check must
                # survive `python -O`, and the message has to be formatted
                # (concatenating str with a dict raised TypeError instead of
                # reporting the faulty coreference).
                raise AssertionError(
                    "Error in coreference {} in {}".format(coref, parsed))

        conll_lookup = self.get_conll_spacy_lookup(tokens, parsed)
        self.conll_tokens.append(tokens)
        self.conll_lookup.append(conll_lookup)

        # Convert conll tokens coref index in spacy tokens indexes
        for coref in corefs:
            coref['start'] = conll_lookup[coref['start']][0]
            coref['end'] = conll_lookup[coref['end']][-1]

        if speaker_id not in self.speakers:
            speaker_name = speaker_id.split(u'_')
            if debug:
                print("New speaker: ", speaker_id, "name: ", speaker_name)
            self.speakers[speaker_id] = Speaker(speaker_id, speaker_name)

        if use_gold_mentions:
            for coref in corefs:
                # coref['end'] is inclusive, hence the +1 for the slice.
                mention = Mention(parsed[coref['start']:coref['end'] + 1],
                                  len(self.mentions),
                                  len(self.utterances),
                                  self.n_sents,
                                  speaker=self.speakers[speaker_id],
                                  gold_label=coref['label'])
                self.mentions.append(mention)
        else:
            mentions_spans = extract_mentions_spans(doc=parsed,
                                                    blacklist=self.blacklist)
            self._process_mentions(mentions_spans, len(self.utterances),
                                   self.n_sents, self.speakers[speaker_id])

            # Assign a gold label to mentions which have one
            if debug:
                print("Check corefs", corefs)
            # Only mentions of the utterance being added can match; select
            # them once instead of rescanning every mention for every coref.
            utterance_index = len(self.utterances)
            current_mentions = [
                m for m in self.mentions
                if m.utterance_index == utterance_index
            ]
            identified_gold = []
            for coref in corefs:
                found = False
                for m in current_mentions:
                    # m.end is compared as exclusive, coref['end'] as inclusive.
                    if coref['start'] == m.start and coref['end'] == m.end - 1:
                        m.gold_label = coref['label']
                        found = True
                identified_gold.append(found)

            for found, coref in zip(identified_gold, corefs):
                if found:
                    continue
                gold_span = parsed[coref['start']:coref['end'] + 1]
                self.missed_gold.append([
                    self.name, self.part,
                    str(len(self.utterances)), parsed.text,
                    gold_span.text
                ])
                if debug:
                    print("❄️ gold mention not in predicted mentions",
                          coref, gold_span)

        self.utterances.append(parsed)
        self.gold_corefs.append(corefs)
        self.utterances_speaker.append(self.speakers[speaker_id])
        self.n_sents += sum(1 for _ in parsed.sents)