def predict_entities(self, collection):
    """Tag every literal occurrence of a known keyphrase in ``collection``.

    For each ``(keyphrase, label)`` pair in ``self.keyphrases`` the lowercased
    text of every sentence in ``collection.sentences`` is scanned for
    occurrences of the keyphrase delimited by regex word boundaries. Each hit
    becomes a new ``Keyphrase`` appended to ``sentence.keyphrases``.

    Ids are assigned from a single counter starting at 0, in keyphrase-major,
    sentence-minor, left-to-right match order.

    The sentences are modified in place; nothing is returned.
    """
    next_id = 0
    for instance_keyphrase, label in self.keyphrases.items():
        # The keyphrase is literal text, not a regex: escape it so that
        # metacharacters ("c++", "a.b", "(x") neither raise re.error nor
        # match unintended text. The pattern does not depend on the
        # sentence, so it is built once per keyphrase.
        pattern = re.compile(r"\b" + re.escape(instance_keyphrase) + r"\b")

        for sentence in collection.sentences:
            # Matching is done on the lowercased text; the keyphrase itself
            # is used as stored.
            text = sentence.text.lower()
            for match in pattern.finditer(text):
                keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                # NOTE(review): split() is defined elsewhere; presumably it
                # breaks a multi-token span into per-token spans -- confirm.
                keyphrase.split()
                next_id += 1
                sentence.keyphrases.append(keyphrase)
def load_keyphrases(cls, collection: Collection, finput: Path):
    """Load keyphrase annotations for ``collection`` from the task A file.

    ``cls.load_input(collection, finput)`` is called first. The annotation
    file is then looked up next to ``finput`` under the name
    ``"output_a_" + finput.name.split("_")[1]``.

    Each line of that file must hold four tab-separated fields: the
    annotation id, the spans, the label and a fourth field that is ignored.
    Spans are ``;``-separated ``"start end"`` pairs whose offsets are relative
    to the whole input; they are converted to sentence-relative offsets
    before the ``Keyphrase`` is created.

    Returns:
        A dict mapping each annotation id (``int``) to the sentence that
        received the corresponding keyphrase.

    Raises:
        ValueError: if a line does not have exactly four tab-separated
            fields or contains a non-integer id or offset.
    """
    cls.load_input(collection, finput)

    input_a_file = finput.parent / ("output_a_" + finput.name.split("_")[1])

    # Cumulative end offset of every sentence. One extra character is counted
    # between consecutive sentences (presumably the newline separating them
    # in the input file -- confirm against load_input).
    sentences_length = []
    offset = 0
    for sentence in collection.sentences:
        offset += len(sentence.text)
        sentences_length.append(offset)
        offset += 1

    sentence_by_id = {}

    # Use a context manager so the file handle is closed deterministically
    # (the handle used to be left open), and stream the lines instead of
    # materialising them all with readlines().
    with input_a_file.open(encoding="utf8") as annotations:
        for line in annotations:
            lid, spans, label, _ = line.strip().split("\t")
            lid = int(lid)
            spans = [
                (int(start), int(end))
                for start, end in (span.split() for span in spans.split(";"))
            ]

            # Find the sentence where this annotation is, using the start of
            # its first listed span.
            i = bisect.bisect(sentences_length, spans[0][0])

            # Make the offsets relative to the start of that sentence.
            if i > 0:
                shift = sentences_length[i - 1] + 1
                spans = [(start - shift, end - shift) for start, end in spans]
                spans.sort(key=lambda t: t[0])

            # Store the annotation in the corresponding sentence.
            the_sentence = collection.sentences[i]
            keyphrase = Keyphrase(the_sentence, label, lid, spans)
            the_sentence.keyphrases.append(keyphrase)

            # Only single-span keyphrases are split; multi-span ones are kept
            # exactly as annotated.
            if len(keyphrase.spans) == 1:
                keyphrase.split()

            sentence_by_id[lid] = the_sentence

    return sentence_by_id
def run(self, collection, taskA, taskB):
    """Annotate ``collection`` using the keyphrases and relations in ``self.model``.

    ``self.model`` is unpacked as ``(gold_keyphrases, gold_relations)``:

    * ``gold_keyphrases`` maps keyphrase text to a label.
    * ``gold_relations`` maps
      ``(origin_text, origin_label, destination_text, destination_label)``
      to a relation label.

    Args:
        collection: object whose ``sentences`` are annotated in place.
        taskA: when truthy, add a ``Keyphrase`` for every word-delimited
            occurrence of a known keyphrase in the lowercased sentence text.
        taskB: when truthy, add a ``Relation`` for every ordered pair of
            keyphrases in a sentence whose key is present in
            ``gold_relations``, then call ``remove_dup_relations()`` on that
            sentence.

    Returns:
        The same ``collection`` object that was passed in.
    """
    gold_keyphrases, gold_relations = self.model

    if taskA:
        next_id = 0
        for gold_keyphrase, label in gold_keyphrases.items():
            # The keyphrase is literal text, not a regex: escape it so that
            # metacharacters ("c++", "a.b", "(x") neither raise re.error nor
            # match unintended text. Built once per keyphrase because it
            # does not depend on the sentence.
            pattern = re.compile(r"\b" + re.escape(gold_keyphrase) + r"\b")

            for sentence in collection.sentences:
                text = sentence.text.lower()
                for match in pattern.finditer(text):
                    keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                    # NOTE(review): split() is defined elsewhere; presumably
                    # it breaks a multi-token span into per-token spans.
                    keyphrase.split()
                    next_id += 1
                    sentence.keyphrases.append(keyphrase)

    if taskB:
        for sentence in collection.sentences:
            # Every ordered pair is tried, including a keyphrase paired with
            # itself; pairs with no known relation are skipped.
            for origin in sentence.keyphrases:
                origin_text = origin.text.lower()
                for destination in sentence.keyphrases:
                    destination_text = destination.text.lower()
                    try:
                        label = gold_relations[
                            origin_text,
                            origin.label,
                            destination_text,
                            destination.label,
                        ]
                    except KeyError:
                        continue
                    relation = Relation(sentence, origin.id, destination.id, label)
                    sentence.relations.append(relation)

            sentence.remove_dup_relations()

    return collection