def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Segment the document text with jieba and annotate every segment.

    Each item produced by ``jieba.tokenize`` is read as a
    ``(word, start, end)`` triple: the prediction spans ``start``..``end``
    and carries the word itself as its feature value.
    """
    for word, start, stop in jieba.tokenize(cas.sofa_string):
        cas.add_annotation(
            self.create_prediction(cas, layer, feature, start, stop, word)
        )
def write_sentence_documents(sentences: List[str], labels: List[str], path: Path, labeled=True):
    """Write sentences (and optionally their sentiment labels) to an XMI file.

    The document text is the sentences joined by single spaces. Every
    sentence gets a ``Sentence`` annotation; when ``labeled`` is true it also
    gets a ``webanno.custom.Sentiment`` annotation whose ``value`` is the
    matching entry of ``labels``. Sentences and labels are paired with
    ``zip``, so the shorter of the two lists decides how many are written.
    After serialising, the covered text of every sentence is printed.
    """
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
    )
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType, name="value", rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    offset = 0
    for text, label in zip(sentences, labels):
        stop = offset + len(text)
        sentence_span = SentenceType(begin=offset, end=stop)
        sentiment_span = SentimentType(begin=offset, end=stop, value=label)

        cas.add_annotation(sentence_span)
        if labeled:
            cas.add_annotation(sentiment_span)

        # Step over the single space that joins this sentence to the next.
        offset = stop + 1

    cas.to_xmi(path, pretty_print=True)

    for covered in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(covered))
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Stem every token with NLTK's Porter stemmer and annotate the result.

    Each prediction starts at the token's begin offset, is exactly as long
    as the stem, and carries the stem as its feature value.
    """
    porter = nltk.PorterStemmer()

    for token in self.iter_tokens(cas):
        stemmed = porter.stem(token.get_covered_text())
        start = token.begin
        cas.add_annotation(
            self.create_prediction(cas, layer, feature, start, start + len(stemmed), stemmed)
        )
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Classify every sentence with the adapter model and annotate it.

    Each sentence's text is tokenized and converted to ids with
    ``self._tokenizer``, run through ``self._model`` with the adapter named
    by ``self._adapter_internal_name``, and the arg-max of ``outputs[0]`` is
    mapped through ``self._label_map``. The resulting label is added as a
    prediction spanning the whole sentence.

    Sentences that produce no token ids are skipped.
    """
    for sentence in cas.select(SENTENCE_TYPE):
        token_ids = self._tokenizer.convert_tokens_to_ids(
            self._tokenizer.tokenize(sentence.get_covered_text())
        )
        if not token_ids:
            # An empty id list would become an empty float tensor, which the
            # model cannot consume; there is nothing to classify anyway.
            continue

        input_tensor = torch.tensor([token_ids])

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = self._model(input_tensor, adapter_names=[self._adapter_internal_name])

        # Retrieve the predicted class label.
        label_id = torch.argmax(outputs[0]).item()
        label = self._label_map[label_id]

        prediction = self.create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Label every sentence with the model stored for ``user_id``.

    Does nothing (apart from a debug log line) when no model has been
    trained yet. Otherwise each sentence's text is passed to the model on
    its own and the returned label is added as a prediction covering the
    sentence.
    """
    model: Optional[Pipeline] = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    for sentence in cas.select(SENTENCE_TYPE):
        # The model takes a batch; send a one-element batch and unwrap it.
        label = model.predict([sentence.get_covered_text()])[0]
        cas.add_annotation(
            create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
        )
def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=False) -> str:
    """
    Serialise a rebuilt ContentItem as an Apache UIMA CAS in XMI format.

    The output file is named after the content item's ID with the `.xmi`
    extension. One ``Sentence`` annotation is created per entry of
    ``ci.lines`` (each entry is used as the end offset of one span and the
    begin offset of the next), followed by one ``ImpressoImages`` annotation
    per image link returned by ``compute_image_links``.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing definitions of
        annotation layers.
    :type typesystem_path: str
    :param iiif_mappings: forwarded to ``compute_image_links`` as
        ``iiif_links``
    :param pct_coordinates: forwarded to ``compute_image_links`` as ``pct``
    :return: path of the XMI file that was written
    :rtype: str
    """
    with open(typesystem_path, "rb") as typesystem_file:
        typesystem = load_typesystem(typesystem_file)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    Sentence = typesystem.get_type('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')
    ImageLink = typesystem.get_type('webanno.custom.ImpressoImages')

    # One sentence-level annotation per line: each break offset closes the
    # current span and opens the next one.
    line_begin = 0
    for line_end in ci.lines:
        cas.add_annotation(Sentence(begin=line_begin, end=line_end))
        line_begin = line_end

    # Attach the IIIF image links to the text spans they belong to.
    image_links = compute_image_links(ci, iiif_links=iiif_mappings, pct=pct_coordinates)
    for link, link_begin, link_end in image_links:
        cas.add_annotation(ImageLink(begin=link_begin, end=link_end, link=link))

    xmi_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(xmi_path, pretty_print=True)
    return xmi_path
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Label all sentences of the document in one batch.

    Loads the model for ``user_id``; when none exists yet, logs a debug
    message and returns. Otherwise the text of every sentence is featurized
    with ``self._get_featurizer()``, the model predicts one label per
    featurized sentence, and each label is added as a prediction covering
    its sentence.

    Documents without any sentence annotation are left untouched.
    """
    model = self._load_model(user_id)
    if model is None:
        logger.debug("No trained model ready yet!")
        return

    # Materialise once: the sentences are walked twice (featurizing, then
    # annotating), which would silently yield nothing the second time if
    # ``select`` returned a one-shot iterator.
    sentences = list(cas.select(SENTENCE_TYPE))
    if not sentences:
        # Nothing to predict; avoid handing an empty batch to the model.
        return

    featurizer = self._get_featurizer()
    featurized_sentences = featurizer.featurize([s.get_covered_text() for s in sentences])
    predictions = model.predict(featurized_sentences)

    for sentence, label in zip(sentences, predictions):
        prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Tag every token, sentence by sentence, by majority vote.

    For each sentence the token texts go through ``self._tokenize_bert``
    and ``self._predict``, and ``self._align_tokens`` regroups the raw
    predictions per original token. Every prediction in a token's group is
    mapped through ``self._label_map`` and the most frequent label is added
    as a prediction covering that token.
    """
    for sentence in cas.select(SENTENCE_TYPE):
        sentence_tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        words = [tok.get_covered_text() for tok in sentence_tokens]

        word_pieces = self._tokenize_bert(words)
        raw_predictions = self._predict(word_pieces)
        per_token_predictions = self._align_tokens(words, word_pieces, raw_predictions)

        for tok, piece_predictions in zip(sentence_tokens, per_token_predictions):
            votes = Counter([self._label_map[pred] for pred in piece_predictions])
            majority_label = votes.most_common(1)[0][0]
            cas.add_annotation(
                self.create_prediction(cas, layer, feature, tok.begin, tok.end, majority_label)
            )
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Predict spans with the CRF model and add them to the CAS.

    Loads the model for ``user_id``; when none exists yet, logs a debug
    message and returns. Otherwise all sentences are featurized with
    ``self._sent2features`` and tagged in a single ``model.predict`` call.
    The per-token tags are then decoded into spans:

    * a tag starting with ``"B"`` opens a new span at that token,
    * a tag starting with ``"I"`` extends the currently open span (and is
      ignored when no span is open),
    * any other tag closes the open span.

    Every closed span is added as a prediction with the fixed value ``"X"``.
    A span is also closed when a new ``"B"`` tag directly follows it and
    when the sentence ends, so adjacent and sentence-final spans are kept.

    Raises ``AssertionError`` if the model returns a different number of
    sentences or tags than it was given.
    """
    model: Optional[sklearn_crfsuite.CRF] = self._load_model(user_id)
    if model is None:
        logger.debug("No trained model ready yet!")
        return

    all_tokens = []
    featurized_sentences = []
    for sentence in cas.select(SENTENCE_TYPE):
        tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        words = [token.get_covered_text() for token in tokens]
        all_tokens.append(tokens)
        featurized_sentences.append(self._sent2features(words))

    all_predictions = model.predict(featurized_sentences)
    assert len(all_predictions) == len(all_tokens)

    def emit_span(span_begin, span_end):
        # All spans get the same placeholder value, as before.
        cas.add_annotation(create_prediction(cas, layer, feature, span_begin, span_end, "X"))

    for predictions, tokens in zip(all_predictions, all_tokens):
        assert len(predictions) == len(tokens)

        begin = None
        end = None
        for tag, token in zip(predictions, tokens):
            if tag.startswith("I"):
                # Continuation: only meaningful while a span is open.
                if begin is not None:
                    end = token.end
                continue

            # Any non-"I" tag ends the span that is currently open. This
            # includes a "B" that directly follows another "B", which used
            # to overwrite (and lose) the previous span.
            if begin is not None:
                emit_span(begin, end)
                begin = None
                end = None

            if tag.startswith("B"):
                begin = token.begin
                end = token.end

        # Flush a span that runs up to the last token of the sentence;
        # previously it was never emitted.
        if begin is not None:
            emit_span(begin, end)
def load_newsgroup_test_data() -> List[Cas]:
    """Build CAS objects for the first five 20-newsgroups test documents.

    The test subset is fetched for ``NEWSGROUP_CATEGORIES`` with a fixed
    shuffle seed. Each returned CAS holds one document as its text and a
    single sentence annotation spanning the whole text.
    """
    twenty_test = fetch_20newsgroups(
        subset="test", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42
    )

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)

    def as_cas(text):
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text
        # The whole document is treated as one sentence.
        cas.add_annotation(SentenceType(begin=0, end=len(text)))
        return cas

    return [as_cas(text) for text in twenty_test.data[:5]]
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Annotate candidate spans that approximately match a known term.

    The stored model is a pair ``(le, items)``: ``items`` is turned into a
    ``Map`` that is searched with ``max_dist=2``, and ``le`` converts the
    matched label ids back to labels via ``inverse_transform``. Candidates
    come from ``self._generate_candidates`` called with 3, then 2, then 1.
    One prediction is added for every match of every candidate. Returns
    without doing anything when no model is available.
    """
    model = self._load_model(user_id)
    if model is None:
        return

    le, items = model
    lookup = Map.from_iter(items)

    # Build all three candidate sources up front, in the order 3, 2, 1.
    candidate_sources = [self._generate_candidates(cas, size) for size in (3, 2, 1)]

    for begin, end, term in chain.from_iterable(candidate_sources):
        for _mention, label_id in lookup.search(term=term, max_dist=2):
            label = le.inverse_transform([label_id])[0]
            cas.add_annotation(create_prediction(cas, layer, feature, begin, end, label))
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Run the spaCy ``ner`` pipe over the CAS tokens and annotate entities.

    A spaCy ``Doc`` is built from the covered text of the existing CAS
    tokens, so spaCy token indices line up with the CAS token list. Every
    entity in ``doc.ents`` becomes a prediction that runs from the begin of
    its first token to the end of its last token, with the entity label as
    value.
    """
    # Materialise the tokens: they are read once to build the Doc and then
    # indexed by position, which needs a real sequence whatever ``select``
    # returns.
    cas_tokens = list(cas.select(TOKEN_TYPE))
    words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]

    doc = Doc(self._model.vocab, words=words)

    # Find the named entities.
    self._model.get_pipe("ner")(doc)

    for named_entity in doc.ents:
        # ``end`` is used as an exclusive index, so the last token of the
        # entity sits at ``end - 1``.
        begin = cas_tokens[named_entity.start].begin
        end = cas_tokens[named_entity.end - 1].end
        label = named_entity.label_
        prediction = create_prediction(cas, layer, feature, begin, end, label)
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Tag every CAS token with spaCy's tagger and annotate its ``pos_``.

    A spaCy ``Doc`` is built from the covered text of the tokens returned by
    ``self.iter_tokens``, the model's tagger is applied to it, and each CAS
    token receives a prediction carrying the ``pos_`` value of the spaCy
    token at the same position.
    """
    # Take one snapshot of the tokens. It is used both to build the Doc and
    # to pair the results, so the two always line up, and the CAS is not
    # being iterated while annotations are added to it below.
    cas_tokens = list(self.iter_tokens(cas))
    words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]

    doc = Doc(self._model.vocab, words=words)

    # Assign the part-of-speech tags.
    self._model.tagger(doc)

    for cas_token, spacy_token in zip(cas_tokens, doc):
        prediction = self.create_prediction(
            cas, layer, feature, cas_token.begin, cas_token.end, spacy_token.pos_
        )
        cas.add_annotation(prediction)
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Tag every CAS token with spaCy and annotate its ``tag_``.

    A spaCy ``Doc`` is built from the covered text of the CAS tokens, the
    ``tok2vec`` and ``tagger`` pipes are applied in that order, and each CAS
    token receives a prediction carrying the ``tag_`` value of the spaCy
    token at the same position.
    """
    # Select the tokens once. The same snapshot builds the Doc and is paired
    # with the results, so the CAS is not queried twice nor iterated while
    # annotations are added to it below.
    cas_tokens = list(cas.select(TOKEN_TYPE))
    words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]

    doc = Doc(self._model.vocab, words=words)

    # Get the pos tags.
    self._model.get_pipe("tok2vec")(doc)
    self._model.get_pipe("tagger")(doc)

    for cas_token, spacy_token in zip(cas_tokens, doc):
        prediction = create_prediction(
            cas, layer, feature, cas_token.begin, cas_token.end, spacy_token.tag_
        )
        cas.add_annotation(prediction)
def load_newsgroup_training_data() -> List[TrainingDocument]:
    """Build training documents from the 20-newsgroups training subset.

    The subset is fetched for ``NEWSGROUP_CATEGORIES`` with a fixed shuffle
    seed. Each document becomes a CAS with one sentence annotation and one
    ``PREDICTED_TYPE`` annotation, both spanning the whole text; the latter's
    ``value`` is the document's target name. The CAS is wrapped in a
    ``TrainingDocument`` named ``doc_<index>`` and owned by ``USER``.
    """
    twenty_train = fetch_20newsgroups(
        subset="train", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42
    )
    target_names = twenty_train.target_names

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)
    PredictedType = typesystem.get_type(PREDICTED_TYPE)

    def build_document(index, text, target):
        text_length = len(text)
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text
        cas.add_annotation(SentenceType(begin=0, end=text_length))
        cas.add_annotation(PredictedType(begin=0, end=text_length, value=target_names[target]))
        return TrainingDocument(cas, f"doc_{index}", USER)

    return [
        build_document(index, text, target)
        for index, (text, target) in enumerate(zip(twenty_train.data, twenty_train.target))
    ]
def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    """Predict word-alignment relations between the first two sentences.

    The ``webanno.custom.Base`` annotations covered by the first sentence
    are the source words, those covered by the second sentence the target
    words. ``self._aligner.get_word_aligns`` returns alignments keyed by
    matching method; only the first method is used. For each aligned
    ``(source_idx, target_idx)`` pair a relation of type ``layer`` is added
    with the source as ``Governor`` and the target as ``Dependent``,
    positioned on the target, and with ``feature`` set to the empty string.

    Returns without adding anything when the document has fewer than two
    sentences. Intermediate values are printed to stdout.
    """
    sentences = list(cas.select(SENTENCE_TYPE))
    if len(sentences) < 2:
        # A source and a target sentence are both required.
        return

    # Lists, because the aligner's indices are used to look tokens up below.
    src_tokens = list(cas.select_covered("webanno.custom.Base", sentences[0]))
    trg_tokens = list(cas.select_covered("webanno.custom.Base", sentences[1]))

    src_sentence = [e.get_covered_text() for e in src_tokens]
    trg_sentence = [e.get_covered_text() for e in trg_tokens]

    print(src_sentence)
    print(trg_sentence)

    alignments = self._aligner.get_word_aligns(src_sentence, trg_sentence)

    Relation = cas.typesystem.get_type(layer)
    print(list(Relation.all_features))

    # Only the first matching method is used; the others are ignored.
    matching_method = next(iter(alignments), None)
    if matching_method is None:
        return

    for source_idx, target_idx in alignments[matching_method]:
        src = src_tokens[source_idx]
        target = trg_tokens[target_idx]

        prediction = Relation(
            Governor=src,
            Dependent=target,
            begin=target.begin,
            end=target.end,
            inception_internal_predicted=True,
        )
        # The relation itself is the prediction; its feature value stays empty.
        setattr(prediction, feature, "")

        print(source_idx, target_idx, prediction)
        cas.add_annotation(prediction)
def convert_single_file(input_paragraph_list: List[str], output_xmi_file: str) -> None:
    """Convert a list of paragraphs into an XMI file with DKPro annotations.

    The document text is the paragraphs joined by newlines. Each paragraph
    is processed with ``nlp``; every non-space token becomes a ``Token``
    annotation, and each paragraph gets one ``Paragraph`` and one
    ``Sentence`` annotation with identical offsets. The document text and
    the covered text of all paragraphs, sentences and tokens are printed.
    Missing parent directories of ``output_xmi_file`` are created before
    the CAS is written.
    """
    document_text = '\n'.join(input_paragraph_list)

    cas = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    cas.sofa_string = document_text

    print("----")
    print(document_text)
    print("----")

    token_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    paragraph_begin: int = 0
    for paragraph_str in input_paragraph_list:
        doc: Doc = nlp(paragraph_str)

        for token in doc:
            assert isinstance(token, Token)
            # Whitespace tokens are not annotated.
            if token.is_space:
                continue
            # token.idx is relative to the paragraph; shift it into the document.
            token_begin: int = paragraph_begin + token.idx
            cas.add_annotation(token_type(begin=token_begin, end=token_begin + len(token)))

        paragraph_end: int = paragraph_begin + len(paragraph_str)

        paragraph_annotation = paragraph_type(begin=paragraph_begin, end=paragraph_end)
        cas.add_annotation(paragraph_annotation)

        # Sentences are aligned exactly to paragraphs.
        cas.add_annotation(
            sentence_type(begin=paragraph_annotation.begin, end=paragraph_annotation.end))

        # The next paragraph starts after the '\n' that joins the two.
        paragraph_begin = paragraph_end + 1

    print([x.get_covered_text() for x in cas.select(paragraph_type.name)])
    print([x.get_covered_text() for x in cas.select(sentence_type.name)])
    print([x.get_covered_text() for x in cas.select(token_type.name)])

    # Create the parent folder if it does not exist yet.
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)

    cas.to_xmi(output_xmi_file)
def export_annotated_texts_to_xmi(annotated_texts: List[AnnotatedText], type_system, file: str, xmi_file=None):
    """Merge annotated texts into one CAS and dump it to a zip file.

    The texts are concatenated into a single document, each one terminated
    by a newline (one is appended when missing). Token, sentence and named
    entity annotations are then added in three separate passes, with every
    offset shifted by the start position of the text it belongs to. The
    sentence annotation of a text covers its original length, i.e. not an
    appended newline. The CAS is finally written to ``file`` through
    ``dump_cas_to_zip_file``, forwarding ``xmi_file``.
    """
    cas = Cas(typesystem=type_system)

    # Build the document text and remember where each text starts in it.
    starts = []
    chunks = []
    offset = 0
    for annotated_text in annotated_texts:
        starts.append(offset)
        chunk = annotated_text.text
        if not chunk.endswith('\n'):
            chunk += '\n'
        chunks.append(chunk)
        offset += len(chunk)
    cas.sofa_string = ''.join(chunks)

    # Tokens
    for annotated_text, start in zip(annotated_texts, starts):
        for token in annotated_text.tokens:
            token_type = cas.typesystem.get_type(TOKEN_NS)
            cas.add_annotation(
                token_type(begin=start + token.start, end=start + token.stop)
            )

    # Sentences: one per text
    for annotated_text, start in zip(annotated_texts, starts):
        sentence_type = cas.typesystem.get_type(SENTENCE_NS)
        cas.add_annotation(
            sentence_type(begin=start, end=start + len(annotated_text.text))
        )

    # Named entities
    for annotated_text, start in zip(annotated_texts, starts):
        for entity in annotated_text.annotations:
            entity_type = cas.typesystem.get_type(NAMED_ENTITY_NS)
            cas.add_annotation(
                entity_type(
                    value=entity.label,
                    begin=start + entity.start,
                    end=start + entity.stop,
                )
            )

    # Write
    with open(file, 'wb') as f:
        dump_cas_to_zip_file(cas, f, xmi_file=xmi_file)