def predict_entities(self, collection):
    # Assumes `re` is imported at module level.
    next_id = 0
    for instance_keyphrase, label in self.keyphrases.items():
        for sentence in collection.sentences:
            text = sentence.text.lower()
            # Escape the keyphrase so it is matched literally, as whole words.
            pattern = r"\b" + re.escape(instance_keyphrase) + r"\b"
            for match in re.finditer(pattern, text):
                keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                keyphrase.split()
                next_id += 1
                sentence.keyphrases.append(keyphrase)
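# Hedged usage sketch for predict_entities (the `baseline` name below is an
# assumption, not part of the module): `self.keyphrases` is read as a dict
# mapping lowercase keyphrase text to its entity label.
#
#     baseline.keyphrases = {"asma": "Concept", "diagnosticar": "Action"}
#     baseline.predict_entities(collection)
#
# Every dictionary hit in every sentence becomes a new Keyphrase.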
def _test_biluov_task():
    import es_core_news_md
    from scripts.utils import Sentence

    def forward(tokensxsentence, entitiesxsentence):
        labelsxsentence, _ = to_biluov(tokensxsentence, entitiesxsentence)
        return [
            from_biluov(biluov, sentence, spans=True)
            for biluov, sentence in zip(labelsxsentence, tokensxsentence)
        ]

    training = Collection().load(Path("data/training/scenario.txt"))
    nlp = es_core_news_md.load()

    def per_label(label):
        tokensxsentence = [nlp(s.text) for s in training.sentences]
        entitiesxsentence = [
            [k.spans for k in s.keyphrases if k.label == label]
            for s in training.sentences
        ]
        return forward(tokensxsentence, entitiesxsentence)

    collection = Collection([Sentence(s.text) for s in training.sentences])
    for label in ENTITIES:
        decoded = per_label(label)
        for entities, sentence in zip(decoded, collection.sentences):
            for spans in entities:
                keyphrase = Keyphrase(sentence, label, -1, spans)
                sentence.keyphrases.append(keyphrase)
    collection.fix_ids()

    output = Path("data/submissions/forward-biluov/train/run1/scenario2-taskA/")
    output.mkdir(parents=True, exist_ok=True)
    collection.dump(output / "scenario.txt", skip_empty_sentences=False)
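# Note: _test_biluov_task is a round-trip check rather than a model. It encodes
# the gold entities with to_biluov, immediately decodes them with from_biluov,
# and dumps the result in submission format, which estimates how much of the
# gold annotation the BILUOV tagging scheme can represent at all.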
from typing import List


def decode_bilou(sentence: Sentence, tags, tokens, spans) -> List[Keyphrase]:
    """tags: B-Concept, B-Action, ..."""
    next_id = 0  # unique id
    tokens = [
        {"token": i, "span": j, "label": k}
        for i, j, k in zip(tokens, spans, tags)
        if j != (0, 0)
    ]

    entity_spans = []
    entity_label = None
    prev_state = None
    prev_label = 'O'

    # If wordpiece subtokens are the atomic elements, merge them back into
    # whole words (e.g. 'as' + '##ma' becomes 'asma').
    words = []
    for token in tokens:
        if token['token'].startswith('##'):
            word = words.pop()
            s0 = word['span'][0]
            s1 = token['span'][1]
            words.append({
                "token": word['token'] + token['token'][2:],
                "span": (s0, s1),
                "label": word['label'],
            })
        else:
            words.append(token)

    list_of_keyphrases = []
    for w in words:
        # If the new tag is B, U or O, or the new label differs from the
        # previous one, or the new tag is I or L but the previous one is
        # neither B nor I, then save the previous entity and reset.
        bool_1 = w['label'][:1] in ['B', 'U', 'O']
        bool_2 = w['label'][2:] != prev_label
        bool_3 = (w['label'][:1] in ['I', 'L']) and (prev_state not in ['B', 'I'])
        if bool_1 or bool_2 or bool_3:
            if entity_spans:
                keyphrase = Keyphrase(
                    sentence=sentence,
                    label=entity_label,
                    id=next_id,
                    spans=entity_spans,
                )
                list_of_keyphrases.append(keyphrase)
                next_id += 1
                entity_spans = []
        if w['label'] == 'O':
            entity_label = 'O'
        else:
            entity_spans.append(w['span'])
            entity_label = w['label'][2:]
        prev_state = w['label'][:1]
        prev_label = entity_label

    # Flush the entity that may still be open at the end of the sentence;
    # without this, an entity ending on the last token would be dropped.
    if entity_spans:
        list_of_keyphrases.append(
            Keyphrase(
                sentence=sentence,
                label=entity_label,
                id=next_id,
                spans=entity_spans,
            )
        )
    return list_of_keyphrases
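# Minimal sketch of decode_bilou on a toy tagging; the helper name and the
# assumption that Sentence accepts raw text (as in the snippets above) are
# illustrative, not part of the original module.
def _demo_decode_bilou():
    sent = Sentence("el asma cronica")
    tags = ['O', 'B-Concept', 'L-Concept']
    tokens = ['el', 'asma', 'cronica']
    spans = [(0, 2), (3, 7), (8, 15)]
    # Expected: one Keyphrase labeled 'Concept' with spans [(3, 7), (8, 15)].
    return decode_bilou(sent, tags, tokens, spans)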
def make_sentence(doc, bilouv, labels) -> Sentence:
    sentence = Sentence(doc.text)
    logger.debug(f"[make_sentence]: doc.text={doc.text}")
    logger.debug(f"[make_sentence]: bilouv={bilouv}")
    labels = set(l[2:] for l in labels if l != 'O')
    for label in labels:
        # Project the tag sequence onto this label only: keep the BILOUV
        # prefix for matching tags and collapse everything else to 'O'.
        # Note: `endswith` assumes no label is a suffix of another label.
        specific_bilouv = [
            tag[0] if tag.endswith(label) else 'O' for tag in bilouv
        ]
        logger.debug(
            f"[make_sentence]: label={label} specific_bilouv={specific_bilouv}"
        )
        spans = from_biluov(specific_bilouv, doc, spans=True)
        sentence.keyphrases.extend(
            Keyphrase(sentence, label, i, sp) for i, sp in enumerate(spans)
        )
    return sentence
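# Hedged sketch of make_sentence's inputs, assuming the Spanish spaCy pipeline
# used elsewhere in this project:
#
#     doc = nlp("el asma cronica")
#     bilouv = ['O', 'B-Concept', 'L-Concept']
#     sentence = make_sentence(doc, bilouv, labels=bilouv)
#
# Passing the tag sequence as `labels` works because only the set of distinct
# label suffixes is extracted from it.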
@classmethod
def load_keyphrases(cls, collection: Collection, finput: Path):
    # Assumes `bisect` is imported at module level.
    cls.load_input(collection, finput)

    input_a_file = finput.parent / ("output_a_" + finput.name.split("_")[1])

    # Cumulative end offset of each sentence within the full document,
    # counting the separator between consecutive sentences.
    sentences_length = [len(s.text) for s in collection.sentences]
    for i in range(1, len(sentences_length)):
        sentences_length[i] += sentences_length[i - 1] + 1

    sentence_by_id = {}

    for line in input_a_file.open(encoding="utf8").readlines():
        lid, spans, label, _ = line.strip().split("\t")
        lid = int(lid)
        spans = [s.split() for s in spans.split(";")]
        spans = [(int(start), int(end)) for start, end in spans]

        # Find the sentence that contains this annotation.
        i = bisect.bisect(sentences_length, spans[0][0])

        # Shift document-level offsets to sentence-level offsets.
        if i > 0:
            spans = [
                (
                    start - sentences_length[i - 1] - 1,
                    end - sentences_length[i - 1] - 1,
                )
                for start, end in spans
            ]
        spans.sort(key=lambda t: t[0])

        # Store the annotation in the corresponding sentence.
        the_sentence = collection.sentences[i]
        keyphrase = Keyphrase(the_sentence, label, lid, spans)
        the_sentence.keyphrases.append(keyphrase)
        if len(keyphrase.spans) == 1:
            keyphrase.split()

        sentence_by_id[lid] = the_sentence

    return sentence_by_id
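# Line format of the output_a_* file, inferred from the parsing above
# (tab-separated; the last field is ignored and is presumably the surface
# text of the annotation):
#
#     1 \t 3 7;8 15 \t Concept \t asma cronica
#
# i.e. id, ';'-separated "start end" pairs in document-level offsets, label,
# and text. The spans are then shifted to sentence-level offsets via bisect.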
def run(self, collection, taskA, taskB):
    gold_keyphrases, gold_relations = self.model

    if taskA:
        next_id = 0
        for gold_keyphrase, label in gold_keyphrases.items():
            for sentence in collection.sentences:
                text = sentence.text.lower()
                # Escape the keyphrase so it is matched literally, as whole words.
                pattern = r"\b" + re.escape(gold_keyphrase) + r"\b"
                for match in re.finditer(pattern, text):
                    keyphrase = Keyphrase(sentence, label, next_id, [match.span()])
                    keyphrase.split()
                    next_id += 1
                    sentence.keyphrases.append(keyphrase)

    if taskB:
        for sentence in collection.sentences:
            for origin in sentence.keyphrases:
                origin_text = origin.text.lower()
                for destination in sentence.keyphrases:
                    destination_text = destination.text.lower()
                    try:
                        label = gold_relations[
                            origin_text,
                            origin.label,
                            destination_text,
                            destination.label,
                        ]
                    except KeyError:
                        continue
                    relation = Relation(sentence, origin.id, destination.id, label)
                    sentence.relations.append(relation)
            sentence.remove_dup_relations()

    return collection
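# Shape of self.model, as read off the lookups above (a hedged reading, not a
# documented contract):
#
#     gold_keyphrases: Dict[str, str]
#         lowercase keyphrase text -> entity label
#     gold_relations: Dict[Tuple[str, str, str, str], str]
#         (origin text, origin label, destination text, destination label)
#         -> relation label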
def run_taskA_for_label(
    self, collection: Collection, entity_label: str, *args, **kwargs
):
    model = self.taskA_models[entity_label]

    print(f"Building dataset for {entity_label} ...")
    dataset = BILUOVSentencesDS(
        [s.text for s in collection.sentences], language=self.nlp
    )
    print("Done!")

    with torch.no_grad():
        for sid, (*s_features, _) in tqdm(
            enumerate(dataset.shallow_dataloader()),
            total=len(dataset),
            desc=entity_label,
        ):
            tokensxsentence = dataset.tokensxsentence[sid]
            output = model(s_features)
            output = model.decode(output)
            labels = [dataset.labels[x] for x in output]
            decoded = from_biluov(labels, tokensxsentence, spans=True)

            sentence = collection.sentences[sid]
            for spans in decoded:
                # Placeholder id (-1); ids are presumably fixed up later.
                keyphrase = Keyphrase(sentence, entity_label, -1, spans)
                sentence.keyphrases.append(keyphrase)
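# Hedged usage sketch: run_taskA_for_label is presumably invoked once per
# entity label, with the placeholder ids fixed afterwards, mirroring the
# collection.fix_ids() pattern in _test_biluov_task above:
#
#     for label in ENTITIES:
#         self.run_taskA_for_label(collection, label)
#     collection.fix_ids()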