def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    for sentence in cas.select(SENTENCE_TYPE):
        cas_tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        tokens = [t.get_covered_text() for t in cas_tokens]

        # Run the BERT subword tokenizer and the model on the sentence.
        grouped_bert_tokens = self._tokenize_bert(tokens)
        predictions = self._predict(grouped_bert_tokens)

        # Map subword-level predictions back onto the original CAS tokens.
        grouped_predictions = self._align_tokens(tokens, grouped_bert_tokens, predictions)

        for token, grouped_prediction in zip(cas_tokens, grouped_predictions):
            begin = token.begin
            end = token.end
            # Majority vote over the labels predicted for the token's subwords.
            label = Counter([self._label_map[pred] for pred in grouped_prediction]).most_common(1)[0][0]
            prediction = self.create_prediction(cas, layer, feature, begin, end, label)
            cas.add_annotation(prediction)
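# Note: create_prediction is assumed to instantiate the target layer's type and
# mark it as a recommender suggestion. A minimal sketch under that assumption
# (the helper actually shipped with the recommender code may differ), reusing
# the "inception_internal_predicted" marker feature visible in the
# word-alignment example further down:
def create_prediction(cas: Cas, layer: str, feature: str, begin: int, end: int, label: str):
    AnnotationType = cas.typesystem.get_type(layer)
    prediction = AnnotationType(begin=begin, end=end)
    setattr(prediction, feature, label)
    setattr(prediction, "inception_internal_predicted", True)
    return prediction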
def featurize_cas(fg: FeatureGenerator, cas: Cas) -> List:
    features = get_features()

    results = []
    for qid, entity in enumerate(cas.select("webanno.custom.EntityLinking")):
        candidates = list(cas.select_covered("inception.internal.KbHandle", entity))
        if len(candidates) == 0:
            continue

        # Find the index of the gold candidate; the for/else skips the entity
        # when no candidate IRI matches the entity's IRI.
        for i, candidate in enumerate(candidates):
            if entity.iri == candidate.iri:
                gold_idx = i
                break
        else:
            continue

        sentences = list(
            cas.select_covering("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", entity)
        )
        assert len(sentences) == 1
        sentence = sentences[0]

        mention = entity.get_covered_text().lower()
        context = sentence.get_covered_text().lower()
        l = len(context)
        # context = context[int(l * 0.25):int(l * 0.75)]

        for cid, candidate in enumerate(candidates):
            score = float(entity.iri == candidate.iri)
            query = candidate.query
            label = candidate.label.lower()

            result = fg.featurize_candidate(
                qid,
                cid,
                "inception_rank",
                score,
                mention,
                context,
                label or "",
                candidate.description or "",
                entity.iri,
                gold_idx,
                candidate.iri,
                features,
            )
            result.update(fg.featurize_query(mention, query, label))

            results.append(result)

    return results
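# The gold-candidate lookup above relies on Python's for/else: the "else"
# branch runs only when the loop finishes without "break", i.e. when no
# candidate matches the gold IRI, in which case the entity is skipped.
# A self-contained illustration of the same idiom (hypothetical values):
def find_gold_index(gold_iri: str, candidate_iris: List[str]) -> Optional[int]:
    for i, iri in enumerate(candidate_iris):
        if iri == gold_iri:
            return i
    return None  # corresponds to falling through to the for/else "continue" above

assert find_gold_index("http://example.org/Q1", ["http://example.org/Q2", "http://example.org/Q1"]) == 1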
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    model: Optional[sklearn_crfsuite.CRF] = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    all_tokens = []
    featurized_sentences = []

    for sentence in cas.select(SENTENCE_TYPE):
        tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        words = [token.get_covered_text() for token in tokens]

        all_tokens.append(tokens)
        featurized_sentences.append(self._sent2features(words))

    all_predictions = model.predict(featurized_sentences)

    assert len(all_predictions) == len(all_tokens)
    for predictions, tokens in zip(all_predictions, all_tokens):
        assert len(predictions) == len(tokens)

        # Decode the BIO tag sequence into spans; "X" is used as a placeholder label.
        begin = None
        end = None
        for tag, token in zip(predictions, tokens):
            # Close the currently open span on an "O" tag or when a new "B" tag starts.
            if begin is not None and end is not None and (tag == "O" or tag.startswith("B")):
                prediction = create_prediction(cas, layer, feature, begin, end, "X")
                cas.add_annotation(prediction)

            if tag.startswith("B"):
                begin = token.begin
                end = token.end
            elif tag.startswith("I"):
                end = token.end
            else:
                begin = None
                end = None

        # Flush a span that extends to the end of the sentence.
        if begin is not None and end is not None:
            prediction = create_prediction(cas, layer, feature, begin, end, "X")
            cas.add_annotation(prediction)
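# _sent2features is assumed to turn a list of words into one feature dict per
# token, the input format that sklearn_crfsuite.CRF.predict expects. A minimal
# sketch in the style of the sklearn-crfsuite tutorial (the recommender's
# actual feature set may differ):
def _sent2features(self, words: List[str]) -> List[dict]:
    featurized = []
    for i, word in enumerate(words):
        token_features = {
            "bias": 1.0,
            "word.lower()": word.lower(),
            "word.istitle()": word.istitle(),
            "word.isupper()": word.isupper(),
            "word.isdigit()": word.isdigit(),
            "suffix3": word[-3:],
        }
        if i > 0:
            token_features["-1:word.lower()"] = words[i - 1].lower()
        else:
            token_features["BOS"] = True
        if i < len(words) - 1:
            token_features["+1:word.lower()"] = words[i + 1].lower()
        else:
            token_features["EOS"] = True
        featurized.append(token_features)
    return featurized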
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    # The first sentence holds the source tokens, the second the target tokens.
    sentences = cas.select(SENTENCE_TYPE)
    src_tokens = cas.select_covered("webanno.custom.Base", sentences[0])
    trg_tokens = cas.select_covered("webanno.custom.Base", sentences[1])

    src_sentence = [e.get_covered_text() for e in src_tokens]
    trg_sentence = [e.get_covered_text() for e in trg_tokens]

    print(src_sentence)
    print(trg_sentence)

    alignments = self._aligner.get_word_aligns(src_sentence, trg_sentence)

    Relation = cas.typesystem.get_type(layer)
    print(list(Relation.all_features))

    for matching_method in alignments:
        for source_idx, target_idx in alignments[matching_method]:
            src = src_tokens[source_idx]
            target = trg_tokens[target_idx]
            # Create one relation per aligned token pair and mark it as a prediction.
            prediction = Relation(
                Governor=src,
                Dependent=target,
                begin=target.begin,
                end=target.end,
                inception_internal_predicted=True,
            )
            # setattr(prediction, feature, f"{src.get_covered_text()} -> {target.get_covered_text()}")
            setattr(prediction, feature, "")
            print(source_idx, target_idx, prediction)
            cas.add_annotation(prediction)

        # Only the alignments of the first matching method are used.
        break
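# get_word_aligns returning a dict of matching-method name -> list of
# (source_index, target_index) pairs matches the SimAlign API, so the aligner
# is presumably constructed along these lines (a sketch, assuming SimAlign is
# the backing library; model and matching methods are illustrative choices):
from simalign import SentenceAligner

aligner = SentenceAligner(model="bert", token_type="bpe", matching_methods="mai")
aligns = aligner.get_word_aligns(["a", "house"], ["ein", "Haus"])
# e.g. {"mwmf": [(0, 0), (1, 1)], "inter": [(0, 0), (1, 1)], "itermax": [(0, 0), (1, 1)]}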
def get_mappable_ann(self, cas: Cas, t: Type):
    # Return the first annotation of the mappable type covered by t.
    return next(cas.select_covered(FeatureExtractor.MAPPABLE_TYPE, t))