def predict_entities(self, collection):

        next_id = 0
        for instance_keyphrase, label in self.keyphrases.items():
            for sentence in collection.sentences:
                text = sentence.text.lower()
                pattern = r'\b' + instance_keyphrase + r'\b'
                for match in re.finditer(pattern, text):
                    keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                    keyphrase.split()
                    next_id += 1

                    sentence.keyphrases.append(keyphrase)
Example #2
0
    def load_keyphrases(cls, collection: Collection, finput: Path):
        cls.load_input(collection, finput)

        input_a_file = finput.parent / ("output_a_" + finput.name.split("_")[1])

        sentences_length = [len(s.text) for s in collection.sentences]
        for i in range(1, len(sentences_length)):
            sentences_length[i] += sentences_length[i - 1] + 1

        sentence_by_id = {}

        for line in input_a_file.open(encoding="utf8").readlines():
            lid, spans, label, _ = line.strip().split("\t")
            lid = int(lid)

            spans = [s.split() for s in spans.split(";")]
            spans = [(int(start), int(end)) for start, end in spans]

            # find the sentence where this annotation is
            i = bisect.bisect(sentences_length, spans[0][0])
            # correct the annotation spans
            if i > 0:
                spans = [
                    (
                        start - sentences_length[i - 1] - 1,
                        end - sentences_length[i - 1] - 1,
                    )
                    for start, end in spans
                ]
                spans.sort(key=lambda t: t[0])
            # store the annotation in the corresponding sentence
            the_sentence = collection.sentences[i]
            keyphrase = Keyphrase(the_sentence, label, lid, spans)
            the_sentence.keyphrases.append(keyphrase)

            if len(keyphrase.spans) == 1:
                keyphrase.split()

            sentence_by_id[lid] = the_sentence

        return sentence_by_id
Example #3
0
    def run(self, collection, taskA, taskB):
        gold_keyphrases, gold_relations = self.model

        if taskA:
            next_id = 0
            for gold_keyphrase, label in gold_keyphrases.items():
                for sentence in collection.sentences:
                    text = sentence.text.lower()
                    pattern = r"\b" + gold_keyphrase + r"\b"
                    for match in re.finditer(pattern, text):
                        keyphrase = Keyphrase(sentence, label, next_id,
                                              [match.span()])
                        keyphrase.split()
                        next_id += 1

                        sentence.keyphrases.append(keyphrase)

        if taskB:
            for sentence in collection.sentences:
                for origin in sentence.keyphrases:
                    origin_text = origin.text.lower()
                    for destination in sentence.keyphrases:
                        destination_text = destination.text.lower()
                        try:
                            label = gold_relations[origin_text, origin.label,
                                                   destination_text,
                                                   destination.label, ]
                        except KeyError:
                            continue
                        relation = Relation(sentence, origin.id,
                                            destination.id, label)
                        sentence.relations.append(relation)

                sentence.remove_dup_relations()

        return collection