def convert_single_file(input_paragraph_list: List[str], output_xmi_file: str) -> None:
    document_text = '\n'.join(input_paragraph_list)
    cas = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    cas.sofa_string = document_text

    print("----")
    print(document_text)
    print("----")

    token_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    total_doc_offset: int = 0
    for paragraph_str in input_paragraph_list:
        this_paragraph_total_offset = total_doc_offset
        doc: Doc = nlp(paragraph_str)
        for token in doc:
            assert isinstance(token, Token)
            # print(token.text, token.idx, len(token), token.idx + len(token), token.is_space)
            begin: int = total_doc_offset + token.idx
            end: int = total_doc_offset + token.idx + len(token)
            # annotate the token -- but only if it is not whitespace!
            if not token.is_space:
                cas.add_annotation(token_type(begin=begin, end=end))
        total_doc_offset += len(paragraph_str)

        # annotate the paragraph
        this_paragraph_annotation = paragraph_type(
            begin=this_paragraph_total_offset, end=total_doc_offset)
        cas.add_annotation(this_paragraph_annotation)
        # advance past the '\n' that joins paragraphs in the sofa string
        total_doc_offset += 1

        # add a sentence aligned exactly to the paragraph
        cas.add_annotation(
            sentence_type(begin=this_paragraph_annotation.begin,
                          end=this_paragraph_annotation.end))

    print([x.get_covered_text() for x in cas.select(paragraph_type.name)])
    print([x.get_covered_text() for x in cas.select(sentence_type.name)])
    print([x.get_covered_text() for x in cas.select(token_type.name)])

    # create the parent folder if it does not exist yet
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)
    cas.to_xmi(output_xmi_file)
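# Hypothetical usage sketch for `convert_single_file` (not part of the original
# module). The function relies on a module-level spaCy pipeline `nlp` plus
# `cassis`, `Cas`, `Type`, `Doc`, `Token`, `Path`, and `List` imports; the
# pipeline name, paragraph texts, and output path below are made up.
import spacy

nlp = spacy.load("en_core_web_sm")

convert_single_file(
    ["First paragraph of the document.", "Second paragraph of the document."],
    "out/document.xmi",
)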
def write_sentence_documents(sentences: List[str], labels: List[str], path: Path,
                             labeled: bool = True):
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType, name="value",
                           rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    begin = 0
    for sentence, label in zip(sentences, labels):
        end = begin + len(sentence)
        cas_sentence = SentenceType(begin=begin, end=end)
        sentiment_annotation = SentimentType(begin=begin, end=end, value=label)
        # skip the " " separator between sentences
        begin = end + 1

        cas.add_annotation(cas_sentence)
        if labeled:
            cas.add_annotation(sentiment_annotation)

    cas.to_xmi(path, pretty_print=True)

    for sentence in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(sentence))
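# Hypothetical usage sketch for `write_sentence_documents` (not from the original
# source): writes two labeled sentences to an XMI file. Assumes
# `from cassis import Cas, TypeSystem`, `from pathlib import Path`, and a
# module-level SENTENCE_TYPE constant naming the DKPro sentence type; the
# sentences, labels, and path are made up.
write_sentence_documents(
    sentences=["I love this movie.", "The plot was boring."],
    labels=["positive", "negative"],
    path=Path("out/sentiment.xmi"),
)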
def _generate_candidates(self, cas: Cas, n: int):
    # We generate token n-grams
    for tokens in mit.windowed(cas.select(TOKEN_TYPE), n):
        begin = tokens[0].begin
        end = tokens[-1].end
        text = cas.sofa_string[begin:end]
        yield (begin, end, text)
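# Standalone sketch of the sliding-window behaviour that `_generate_candidates`
# relies on (illustrative only, not part of the original class):
# more_itertools.windowed yields overlapping n-tuples, shown here over plain
# strings instead of CAS tokens.
import more_itertools as mit

assert list(mit.windowed(["a", "b", "c", "d"], 2)) == [
    ("a", "b"), ("b", "c"), ("c", "d"),
]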
def iter_tokens(self, cas: Cas) -> Iterator[FeatureStructure]:
    """Returns an iterator over all tokens in the given document.

    Args:
        cas: The CAS containing the document whose tokens are iterated.

    Returns:
        An iterator over all token feature structures in `cas`.
    """
    return cas.select(TOKEN_TYPE)
def iter_sentences(self, cas: Cas) -> Iterator[FeatureStructure]:
    """Returns an iterator over all sentences in the given document.

    Args:
        cas: The CAS containing the document whose sentences are iterated.

    Returns:
        An iterator over all sentence feature structures in `cas`.
    """
    return cas.select(SENTENCE_TYPE)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    # Extract the tokens from the CAS and create a spacy doc from it
    words = [cas.get_covered_text(cas_token) for cas_token in cas.select(TOKEN_TYPE)]
    doc = Doc(self._model.vocab, words=words)

    # Get the pos tags
    self._model.get_pipe("tok2vec")(doc)
    self._model.get_pipe("tagger")(doc)

    # For every token, extract the POS tag and create an annotation in the CAS
    for cas_token, spacy_token in zip(cas.select(TOKEN_TYPE), doc):
        prediction = create_prediction(cas, layer, feature, cas_token.begin,
                                       cas_token.end, spacy_token.tag_)
        cas.add_annotation(prediction)
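# Minimal standalone sketch of the pre-tokenized spaCy pattern used above
# (illustrative, not part of the original recommender). It assumes a spaCy v3
# pipeline such as en_core_web_sm is installed; tok2vec must run before the
# tagger, exactly as in `predict`.
import spacy
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")
doc = Doc(nlp.vocab, words=["The", "cat", "sat"])
nlp.get_pipe("tok2vec")(doc)
nlp.get_pipe("tagger")(doc)
print([t.tag_ for t in doc])  # fine-grained POS tags, e.g. ['DT', 'NN', 'VBD']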
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    stemmer = nltk.PorterStemmer()

    # For every token, stem it and create an annotation in the CAS
    for cas_token in cas.select(TOKEN_TYPE):
        stem = stemmer.stem(cas_token.get_covered_text())
        begin = cas_token.begin
        end = begin + len(stem)
        prediction = create_prediction(cas, layer, feature, begin, end, stem)
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    for sentence in cas.select(SENTENCE_TYPE):
        token_ids = self._tokenizer.convert_tokens_to_ids(
            self._tokenizer.tokenize(sentence.get_covered_text()))
        input_tensor = torch.tensor([token_ids])

        # predict the output tensor
        outputs = self._model(input_tensor, adapter_names=[self._adapter_internal_name])

        # retrieve the predicted class label
        label_id = torch.argmax(outputs[0]).item()
        label = self._label_map[label_id]

        prediction = self.create_prediction(cas, layer, feature, sentence.begin,
                                            sentence.end, label)
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    model: Optional[Pipeline] = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    for sentence in cas.select(SENTENCE_TYPE):
        predicted = model.predict([sentence.get_covered_text()])[0]
        prediction = create_prediction(cas, layer, feature, sentence.begin,
                                       sentence.end, predicted)
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    model = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    featurizer = self._get_featurizer()
    sentences = cas.select(SENTENCE_TYPE)
    featurized_sentences = featurizer.featurize([s.get_covered_text() for s in sentences])
    predictions = model.predict(featurized_sentences)

    for sentence, label in zip(sentences, predictions):
        prediction = create_prediction(cas, layer, feature, sentence.begin,
                                       sentence.end, label)
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    for sentence in cas.select(SENTENCE_TYPE):
        cas_tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        tokens = [t.get_covered_text() for t in cas_tokens]

        grouped_bert_tokens = self._tokenize_bert(tokens)
        predictions = self._predict(grouped_bert_tokens)
        grouped_predictions = self._align_tokens(tokens, grouped_bert_tokens, predictions)

        for token, grouped_prediction in zip(cas_tokens, grouped_predictions):
            begin = token.begin
            end = token.end
            # majority vote over the labels of the word pieces belonging to this token
            label = Counter(
                [self._label_map[pred] for pred in grouped_prediction]
            ).most_common(1)[0][0]
            prediction = self.create_prediction(cas, layer, feature, begin, end, label)
            cas.add_annotation(prediction)
def featurize_cas(fg: FeatureGenerator, cas: Cas) -> List:
    features = get_features()

    results = []
    for qid, entity in enumerate(cas.select("webanno.custom.EntityLinking")):
        candidates = list(cas.select_covered("inception.internal.KbHandle", entity))
        if len(candidates) == 0:
            continue

        # find the index of the gold candidate; skip the entity if none matches
        for i, candidate in enumerate(candidates):
            if entity.iri == candidate.iri:
                gold_idx = i
                break
        else:
            continue

        sentences = list(cas.select_covering(
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", entity))
        assert len(sentences) == 1
        sentence = sentences[0]

        mention = entity.get_covered_text().lower()
        context = sentence.get_covered_text().lower()
        l = len(context)
        # context = context[int(l * 0.25):int(l * 0.75)]

        for cid, candidate in enumerate(candidates):
            score = float(entity.iri == candidate.iri)
            query = candidate.query
            label = candidate.label.lower()

            result = fg.featurize_candidate(qid, cid, "inception_rank", score,
                                            mention, context, label or "",
                                            candidate.description or "",
                                            entity.iri, gold_idx, candidate.iri,
                                            features)
            result.update(fg.featurize_query(mention, query, label))
            results.append(result)

    return results
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    model: Optional[sklearn_crfsuite.CRF] = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    all_tokens = []
    featurized_sentences = []
    for sentence in cas.select(SENTENCE_TYPE):
        tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        words = [token.get_covered_text() for token in tokens]
        all_tokens.append(tokens)
        featurized_sentences.append(self._sent2features(words))

    all_predictions = model.predict(featurized_sentences)

    assert len(all_predictions) == len(all_tokens)
    for predictions, tokens in zip(all_predictions, all_tokens):
        assert len(predictions) == len(tokens)

        # decode the BIO tag sequence into character spans
        begin = None
        end = None
        for tag, token in zip(predictions, tokens):
            # an "O" tag or a new "B" tag closes the currently open span
            if begin is not None and (tag == "O" or tag.startswith("B")):
                prediction = create_prediction(cas, layer, feature, begin, end, "X")
                cas.add_annotation(prediction)
                begin = None
                end = None

            if tag.startswith("B"):
                begin = token.begin
                end = token.end
            elif tag.startswith("I") and begin is not None:
                end = token.end

        # flush a span that runs to the end of the sentence
        if begin is not None:
            prediction = create_prediction(cas, layer, feature, begin, end, "X")
            cas.add_annotation(prediction)
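# Standalone sketch of the BIO span decoding used above (illustrative, not part
# of the original recommender): a tag sequence is collapsed into half-open
# (begin_idx, end_idx) token-index spans, using the same close-and-flush rules.
def decode_bio(tags):
    spans, begin = [], None
    for i, tag in enumerate(tags):
        if begin is not None and (tag == "O" or tag.startswith("B")):
            spans.append((begin, i))
            begin = None
        if tag.startswith("B"):
            begin = i
    if begin is not None:
        spans.append((begin, len(tags)))
    return spans

assert decode_bio(["B", "I", "O", "B", "B"]) == [(0, 2), (3, 4), (4, 5)]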
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    # Extract the tokens from the CAS and create a spacy doc from it
    cas_tokens = list(cas.select(TOKEN_TYPE))
    words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]
    doc = Doc(self._model.vocab, words=words)

    # Find the named entities
    self._model.get_pipe("ner")(doc)

    # For every entity returned by spacy, create an annotation in the CAS
    for named_entity in doc.ents:
        begin = cas_tokens[named_entity.start].begin
        end = cas_tokens[named_entity.end - 1].end
        label = named_entity.label_
        prediction = create_prediction(cas, layer, feature, begin, end, label)
        cas.add_annotation(prediction)
def get_perc_of_mapping_type(self, cas: Cas, alignment: AlignmentLabel) -> float:
    overall_matches_count = 0
    items_of_given_type_count = 0

    for t in cas.select(FeatureExtractor.TOKEN_TYPE):
        item = self.get_mappable_ann(cas, t)

        # check for matches/alignment
        if item.match is not None and item.match.target is not None:
            overall_matches_count += 1

            # check for types
            if item.match.label == alignment.name:
                items_of_given_type_count += 1

    # if nothing has been matched at all, the result is 0
    if overall_matches_count == 0:
        return 0.0

    return items_of_given_type_count / overall_matches_count
def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
            document_id: str, user_id: str):
    sentences = cas.select(SENTENCE_TYPE)
    src_tokens = cas.select_covered("webanno.custom.Base", sentences[0])
    trg_tokens = cas.select_covered("webanno.custom.Base", sentences[1])

    src_sentence = [e.get_covered_text() for e in src_tokens]
    trg_sentence = [e.get_covered_text() for e in trg_tokens]

    print(src_sentence)
    print(trg_sentence)

    alignments = self._aligner.get_word_aligns(src_sentence, trg_sentence)

    Relation = cas.typesystem.get_type(layer)
    print(list(Relation.all_features))

    # only the first matching method returned by the aligner is used
    for matching_method in alignments:
        for source_idx, target_idx in alignments[matching_method]:
            src = src_tokens[source_idx]
            target = trg_tokens[target_idx]

            prediction = Relation(
                Governor=src,
                Dependent=target,
                begin=target.begin,
                end=target.end,
                inception_internal_predicted=True,
            )
            # setattr(prediction, feature, f"{src.get_covered_text()} -> {target.get_covered_text()}")
            setattr(prediction, feature, "")
            print(source_idx, target_idx, prediction)
            cas.add_annotation(prediction)
        break