def write_sentence_documents(sentences: List[str], labels: List[str], path: Path, labeled=True):
    """Write sentences, optionally with sentiment labels, to an XMI file.

    Each sentence gets a DKPro Sentence annotation over its span; when
    *labeled* is true, a custom ``webanno.custom.Sentiment`` annotation
    carrying the corresponding label covers the same span.

    :param sentences: the sentence texts, joined by single spaces in the sofa
    :param labels: one sentiment label per sentence (zipped; extras ignored)
    :param path: destination of the pretty-printed XMI file
    :param labeled: whether to emit the Sentiment annotations
    """
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType, name="value",
                           rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    # Walk the joined text; each sentence span is followed by one space.
    offset = 0
    for sentence, label in zip(sentences, labels):
        span_end = offset + len(sentence)
        cas.add_annotation(SentenceType(begin=offset, end=span_end))
        if labeled:
            cas.add_annotation(SentimentType(begin=offset, end=span_end, value=label))
        offset = span_end + 1  # skip the joining space

    cas.to_xmi(path, pretty_print=True)

    for sentence in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(sentence))
def convert_single_file(input_paragraph_list: List[str], output_xmi_file: str) -> None:
    """Convert a list of paragraph strings into a DKPro-typed XMI file.

    The paragraphs are joined with ``'\\n'`` into the document text. spaCy
    tokenizes each paragraph; non-whitespace tokens, the paragraph spans,
    and one sentence per paragraph (aligned exactly to the paragraph) are
    annotated. Parent folders of *output_xmi_file* are created as needed.

    :param input_paragraph_list: paragraph texts, in document order
    :param output_xmi_file: path of the XMI file to write
    """
    document_text = '\n'.join(input_paragraph_list)
    cas = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    cas.sofa_string = document_text
    print("----")
    print(document_text)
    print("----")

    token_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    total_doc_offset: int = 0
    for paragraph_str in input_paragraph_list:
        this_paragraph_total_offset = total_doc_offset
        doc: Doc = nlp(paragraph_str)
        for token in doc:
            assert isinstance(token, Token)
            begin: int = total_doc_offset + token.idx
            end: int = total_doc_offset + token.idx + len(token)
            # Annotate the token -- only if it is not pure whitespace.
            # (Types are callable; no need for explicit __call__.)
            if not token.is_space:
                cas.add_annotation(token_type(begin=begin, end=end))
        total_doc_offset += len(paragraph_str)

        # Annotate the paragraph over its full span.
        this_paragraph_annotation = paragraph_type(
            begin=this_paragraph_total_offset, end=total_doc_offset)
        cas.add_annotation(this_paragraph_annotation)

        # Advance past the '\n' separator inserted by the join above.
        total_doc_offset += 1

        # Add a sentence aligned exactly to the paragraph.
        cas.add_annotation(
            sentence_type(begin=this_paragraph_annotation.begin,
                          end=this_paragraph_annotation.end))

    print([x.get_covered_text() for x in cas.select(paragraph_type.name)])
    print([x.get_covered_text() for x in cas.select(sentence_type.name)])
    print([x.get_covered_text() for x in cas.select(token_type.name)])

    # Create the parent folder if it does not exist yet.
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)
    cas.to_xmi(output_xmi_file)
def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=False) -> str:
    """
    Converts a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file will be named after the content item's ID, adding
    the `.xmi` extension.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing definitions of annotation layers.
    :type typesystem_path: str
    :param iiif_mappings: mapping used to resolve IIIF image links
    :param pct_coordinates: whether image coordinates are expressed as percentages
    :return: path of the written XMI file
    :rtype: str
    """
    with open(typesystem_path, "rb") as ts_file:
        typesystem = load_typesystem(ts_file)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    Sentence = typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')
    ImageLink = typesystem.get_type('webanno.custom.ImpressoImages')

    # One sentence annotation per line, bounded by the line-break offsets.
    previous_break = 0
    for break_offset in ci.lines:
        cas.add_annotation(Sentence(begin=previous_break, end=break_offset))
        previous_break = break_offset

    # Inject the IIIF links as image-link annotations over their spans.
    iiif_links = compute_image_links(ci, iiif_links=iiif_mappings,
                                     pct=pct_coordinates)
    for iiif_link, link_begin, link_end in iiif_links:
        cas.add_annotation(ImageLink(begin=link_begin, end=link_end,
                                     link=iiif_link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path
def generate_cas(self, typesystem: TypeSystem) -> Cas:
    """Build a randomly annotated CAS over a 130-character dummy sofa.

    For each of ``self.size`` rounds, one feature structure of every type
    (in shuffled order) is added, with a random begin in [0, 100] and an
    end drawn from [0, 30] offset by ``self.minimum_width``.

    :param typesystem: the type system the CAS is built against
    :return: the populated CAS
    """
    cas = Cas(typesystem)
    cas.sofa_string = "x" * 130

    # DocumentAnnotation is managed by the CAS itself -- don't add copies.
    types = list(typesystem.get_types())
    types.remove(cas.typesystem.get_type(TYPE_NAME_DOCUMENT_ANNOTATION))
    self.rnd.shuffle(types)

    for _ in range(self.size):
        for annotation_type in types:
            begin = self.rnd.randint(0, 100)
            end = self.rnd.randint(0, 30) + self.minimum_width
            cas.add(annotation_type(begin=begin, end=end))
    return cas
def export_annotated_texts_to_xmi(annotated_texts: List[AnnotatedText], type_system, file: str, xmi_file=None):
    """Export annotated texts as one CAS and write it to a zip file.

    The texts are concatenated into a single sofa string (each forced to
    end with a newline); token, sentence, and named-entity annotations
    are shifted by each text's start offset within the sofa.

    :param annotated_texts: texts carrying token and annotation spans
    :param type_system: cassis TypeSystem providing the TOKEN_NS,
        SENTENCE_NS, and NAMED_ENTITY_NS types
    :param file: output zip file path
    :param xmi_file: optional name of the XMI entry inside the zip
    """
    cas = Cas(typesystem=type_system)

    # Build the sofa string, remembering each text's start offset.
    # Collect parts and join once -- avoids quadratic += concatenation.
    starts = []
    parts = []
    current_start = 0
    for annotated_text in annotated_texts:
        starts.append(current_start)
        text = annotated_text.text
        if not text.endswith('\n'):
            text += '\n'
        parts.append(text)
        current_start += len(text)
    cas.sofa_string = ''.join(parts)

    # Hoist the type lookups out of the loops.
    token_type = cas.typesystem.get_type(TOKEN_NS)
    sentence_type = cas.typesystem.get_type(SENTENCE_NS)
    named_entity_type = cas.typesystem.get_type(NAMED_ENTITY_NS)

    # Tokens
    for annotated_text, start in zip(annotated_texts, starts):
        for token in annotated_text.tokens:
            cas.add_annotation(token_type(begin=start + token.start,
                                          end=start + token.stop))

    # Sentences (one per text, covering the original un-padded text)
    for annotated_text, start in zip(annotated_texts, starts):
        cas.add_annotation(sentence_type(begin=start,
                                         end=start + len(annotated_text.text)))

    # Named-entity annotations
    for annotated_text, start in zip(annotated_texts, starts):
        for annotation in annotated_text.annotations:
            cas.add_annotation(named_entity_type(
                value=annotation.label,
                begin=start + annotation.start,
                end=start + annotation.stop))

    # write
    with open(file, 'wb') as f:
        dump_cas_to_zip_file(cas, f, xmi_file=xmi_file)
def load_newsgroup_test_data(limit: int = 5) -> List[Cas]:
    """Load 20newsgroups test documents as CASes.

    Each document's full text becomes the sofa, covered by a single
    sentence annotation.

    :param limit: maximum number of test documents to convert
        (default 5, matching the previous hard-coded slice)
    :return: one CAS per loaded document
    """
    twenty_test = fetch_20newsgroups(subset="test",
                                     categories=NEWSGROUP_CATEGORIES,
                                     shuffle=True,
                                     random_state=42)

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)

    result = []
    for text in twenty_test.data[:limit]:
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text
        # One sentence spanning the whole document.
        cas.add_annotation(SentenceType(begin=0, end=len(text)))
        result.append(cas)
    return result
def load_newsgroup_training_data() -> List[TrainingDocument]:
    """Load the 20newsgroups training subset as TrainingDocuments.

    Each document's full text becomes the sofa, covered by one sentence
    annotation plus one predicted-label annotation carrying the gold
    category name.

    :return: one TrainingDocument per training document
    """
    twenty_train = fetch_20newsgroups(subset="train",
                                      categories=NEWSGROUP_CATEGORIES,
                                      shuffle=True,
                                      random_state=42)
    target_names = twenty_train.target_names

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)
    PredictedType = typesystem.get_type(PREDICTED_TYPE)

    documents = []
    for index, (text, target) in enumerate(zip(twenty_train.data,
                                               twenty_train.target)):
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text
        # Both annotations span the whole document.
        doc_end = len(text)
        cas.add_annotation(SentenceType(begin=0, end=doc_end))
        cas.add_annotation(PredictedType(begin=0, end=doc_end,
                                         value=target_names[target]))
        documents.append(TrainingDocument(cas, f"doc_{index}", USER))
    return documents