def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    # jieba.tokenize yields (word, start, end) tuples over the document text
    result = jieba.tokenize(cas.sofa_string)
    for tk in result:
        prediction = self.create_prediction(cas, layer, feature, tk[1], tk[2], tk[0])
        cas.add_annotation(prediction)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    stemmer = nltk.PorterStemmer()

    # For every token, stem it and create an annotation in the CAS
    for cas_token in self.iter_tokens(cas):
        stem = stemmer.stem(cas_token.get_covered_text())
        begin = cas_token.begin
        end = begin + len(stem)
        prediction = self.create_prediction(cas, layer, feature, begin, end, stem)
        cas.add_annotation(prediction)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    for i, sentence in enumerate(cas.select(SENTENCE_TYPE)):
        token_ids = self._tokenizer.convert_tokens_to_ids(
            self._tokenizer.tokenize(sentence.get_covered_text()))
        input_tensor = torch.tensor([token_ids])

        # predict output tensor
        outputs = self._model(input_tensor, adapter_names=[self._adapter_internal_name])

        # retrieve the predicted class label
        label_id = torch.argmax(outputs[0]).item()
        label = self._label_map[label_id]

        prediction = self.create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
        cas.add_annotation(prediction)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    model: Optional[Pipeline] = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    for sentence in cas.select(SENTENCE_TYPE):
        predicted = model.predict([sentence.get_covered_text()])[0]
        prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, predicted)
        cas.add_annotation(prediction)

def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=False) -> str:
    """
    Converts a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file will be named after the content item's ID, adding
    the `.xmi` extension.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing definitions of annotation layers
    :type typesystem_path: str
    :param iiif_mappings: mappings used to compute IIIF image links
    :param pct_coordinates: whether image coordinates are expressed as percentages
    :type pct_coordinates: bool
    :return: the path of the generated XMI file
    :rtype: str
    """
    with open(typesystem_path, "rb") as f:
        typesystem = load_typesystem(f)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    sentType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    imgLinkType = 'webanno.custom.ImpressoImages'
    Sentence = typesystem.get_type(sentType)
    ImageLink = typesystem.get_type(imgLinkType)

    # create sentence-level annotations
    start_offset = 0
    for break_offset in ci.lines:
        start = start_offset
        end = break_offset
        start_offset = break_offset
        cas.add_annotation(Sentence(begin=start, end=end))

    iiif_links = compute_image_links(ci, iiif_links=iiif_mappings, pct=pct_coordinates)

    # inject the IIIF links into the CAS
    for iiif_link, start, end in iiif_links:
        cas.add_annotation(ImageLink(begin=start, end=end, link=iiif_link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    model = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    featurizer = self._get_featurizer()

    sentences = cas.select(SENTENCE_TYPE)
    featurized_sentences = featurizer.featurize([s.get_covered_text() for s in sentences])
    predictions = model.predict(featurized_sentences)

    for sentence, featurized_sentence, label in zip(sentences, featurized_sentences, predictions):
        prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
        cas.add_annotation(prediction)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    for sentence in cas.select(SENTENCE_TYPE):
        cas_tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        tokens = [t.get_covered_text() for t in cas_tokens]

        grouped_bert_tokens = self._tokenize_bert(tokens)
        predictions = self._predict(grouped_bert_tokens)
        grouped_predictions = self._align_tokens(tokens, grouped_bert_tokens, predictions)

        for token, grouped_prediction in zip(cas_tokens, grouped_predictions):
            begin = token.begin
            end = token.end
            # Use the majority label among the word-piece predictions for this token
            label = Counter([self._label_map[pred] for pred in grouped_prediction]).most_common(1)[0][0]
            prediction = self.create_prediction(cas, layer, feature, begin, end, label)
            cas.add_annotation(prediction)

def _generate_candidates(self, cas: Cas, n: int):
    # We generate token n-grams
    for tokens in mit.windowed(cas.select(TOKEN_TYPE), n):
        begin = tokens[0].begin
        end = tokens[-1].end
        text = cas.sofa_string[begin:end]
        yield (begin, end, text)

def generate_cas(self, typesystem: TypeSystem) -> Cas:
    cas = Cas(typesystem)
    cas.sofa_string = "x" * 130

    types = [t for t in typesystem.get_types()]
    types.remove(cas.typesystem.get_type(TYPE_NAME_DOCUMENT_ANNOTATION))
    self.rnd.shuffle(types)

    for n in range(0, self.size):
        for T in types:
            begin = self.rnd.randint(0, 100)
            end = self.rnd.randint(0, 30) + self.minimum_width
            fs = T(begin=begin, end=end)
            cas.add(fs)

    return cas

def featurize_cas(fg: FeatureGenerator, cas: Cas) -> List:
    features = get_features()

    results = []
    for qid, entity in enumerate(cas.select("webanno.custom.EntityLinking")):
        candidates = list(cas.select_covered("inception.internal.KbHandle", entity))

        if len(candidates) == 0:
            continue

        for i, candidate in enumerate(candidates):
            if entity.iri == candidate.iri:
                gold_idx = i
                break
        else:
            continue

        sentences = list(cas.select_covering(
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", entity))
        assert len(sentences) == 1
        sentence = sentences[0]

        mention = entity.get_covered_text().lower()
        context = sentence.get_covered_text().lower()
        l = len(context)
        # context = context[int(l * 0.25):int(l * 0.75)]

        for cid, candidate in enumerate(candidates):
            score = float(entity.iri == candidate.iri)
            query = candidate.query
            label = candidate.label.lower()

            result = fg.featurize_candidate(qid, cid, "inception_rank", score,
                                            mention, context, label or "",
                                            candidate.description or "",
                                            entity.iri, gold_idx, candidate.iri, features)
            result.update(fg.featurize_query(mention, query, label))

            results.append(result)

    return results

def extract(self, cas: Cas) -> (str, float):
    studentView = cas.get_view(FeatureExtractor.STUDENT_ANSWER_VIEW)

    variety = 0.0
    for al in AlignmentLabel:
        variety += self.get_perc_of_mapping_type(studentView, al)

    return ("Variety", variety)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    model: Optional[sklearn_crfsuite.CRF] = self._load_model(user_id)

    if model is None:
        logger.debug("No trained model ready yet!")
        return

    all_tokens = []
    featurized_sentences = []
    for sentence in cas.select(SENTENCE_TYPE):
        tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
        words = [token.get_covered_text() for token in tokens]

        all_tokens.append(tokens)
        featurized_sentences.append(self._sent2features(words))

    all_predictions = model.predict(featurized_sentences)

    assert len(all_predictions) == len(all_tokens)
    for predictions, tokens in zip(all_predictions, all_tokens):
        assert len(predictions) == len(tokens)

        begin = None
        end = None
        prev_tag = "O"
        for tag, token in zip(predictions, tokens):
            # Close the current span when leaving it
            if begin is not None and end is not None:
                if tag == "O" or (tag.startswith("B") and prev_tag.startswith("I")):
                    prediction = create_prediction(cas, layer, feature, begin, end, "X")
                    cas.add_annotation(prediction)

            if tag.startswith("B"):
                begin = token.begin
                end = token.end
            elif tag.startswith("I"):
                end = token.end
            else:
                begin = None
                end = None

            prev_tag = tag

def load_newsgroup_test_data() -> List[Cas]:
    twenty_test = fetch_20newsgroups(subset="test", categories=NEWSGROUP_CATEGORIES,
                                     shuffle=True, random_state=42)

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)

    result = []
    for text in twenty_test.data[:5]:
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        begin = 0
        end = len(text)
        cas.add_annotation(SentenceType(begin=begin, end=end))

        result.append(cas)

    return result

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    model = self._load_model(user_id)
    if model is None:
        return

    le, items = model
    m = Map.from_iter(items)

    # We iterate over all candidates and check whether they match
    for (begin, end, term) in chain(
        self._generate_candidates(cas, 3),
        self._generate_candidates(cas, 2),
        self._generate_candidates(cas, 1)
    ):
        for mention, label_id in m.search(term=term, max_dist=2):
            label = le.inverse_transform([label_id])[0]
            prediction = create_prediction(cas, layer, feature, begin, end, label)
            cas.add_annotation(prediction)

def iter_sentences(self, cas: Cas) -> Iterator[FeatureStructure]:
    """Returns an iterator over all sentences in the given document.

    Args:
        cas: The CAS containing the document.

    Returns:
        An iterator over all sentence annotations in the CAS.
    """
    return cas.select(SENTENCE_TYPE)

def iter_tokens(self, cas: Cas) -> Iterator[FeatureStructure]:
    """Returns an iterator over all tokens in the given document.

    Args:
        cas: The CAS containing the document.

    Returns:
        An iterator over all token annotations in the CAS.
    """
    return cas.select(TOKEN_TYPE)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    # Extract the tokens from the CAS and create a spacy doc from it
    words = [cas.get_covered_text(cas_token) for cas_token in self.iter_tokens(cas)]
    doc = Doc(self._model.vocab, words=words)

    # Run the part-of-speech tagger
    self._model.tagger(doc)

    # For every token, extract the POS tag and create an annotation in the CAS
    for cas_token, spacy_token in zip(self.iter_tokens(cas), doc):
        prediction = self.create_prediction(cas, layer, feature,
                                            cas_token.begin, cas_token.end, spacy_token.pos_)
        cas.add_annotation(prediction)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    # Extract the tokens from the CAS and create a spacy doc from it
    cas_tokens = cas.select(TOKEN_TYPE)
    words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]
    doc = Doc(self._model.vocab, words=words)

    # Find the named entities
    self._model.get_pipe("ner")(doc)

    # For every entity returned by spacy, create an annotation in the CAS
    for named_entity in doc.ents:
        begin = cas_tokens[named_entity.start].begin
        end = cas_tokens[named_entity.end - 1].end
        label = named_entity.label_
        prediction = create_prediction(cas, layer, feature, begin, end, label)
        cas.add_annotation(prediction)

def export_annotated_texts_to_xmi(annotated_texts: List[AnnotatedText], type_system, file: str, xmi_file=None):
    cas = Cas(typesystem=type_system)

    current_start = 0
    starts = []
    sofa_string = ''

    # Create sofa string
    for annotated_text in annotated_texts:
        starts.append(current_start)

        text = annotated_text.text
        if not text.endswith('\n'):
            text += '\n'

        sofa_string += text
        current_start += len(text)

    cas.sofa_string = sofa_string

    # Tokens
    for annotated_text, start in zip(annotated_texts, starts):
        for token in annotated_text.tokens:
            annotation = cas.typesystem.get_type(TOKEN_NS)(
                begin=start + token.start,
                end=start + token.stop)
            cas.add_annotation(annotation)

    # Sentences
    for annotated_text, start in zip(annotated_texts, starts):
        annotation = cas.typesystem.get_type(SENTENCE_NS)(
            begin=start,
            end=start + len(annotated_text.text))
        cas.add_annotation(annotation)

    # Annotations
    for annotated_text, start in zip(annotated_texts, starts):
        for annotation in annotated_text.annotations:
            annotation = cas.typesystem.get_type(NAMED_ENTITY_NS)(
                value=annotation.label,
                begin=start + annotation.start,
                end=start + annotation.stop)
            cas.add_annotation(annotation)

    # write
    with open(file, 'wb') as f:
        dump_cas_to_zip_file(cas, f, xmi_file=xmi_file)

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    # Extract the tokens from the CAS and create a spacy doc from it
    words = [cas.get_covered_text(cas_token) for cas_token in cas.select(TOKEN_TYPE)]
    doc = Doc(self._model.vocab, words=words)

    # Get the POS tags
    self._model.get_pipe("tok2vec")(doc)
    self._model.get_pipe("tagger")(doc)

    # For every token, extract the POS tag and create an annotation in the CAS
    for cas_token, spacy_token in zip(cas.select(TOKEN_TYPE), doc):
        prediction = create_prediction(cas, layer, feature,
                                       cas_token.begin, cas_token.end, spacy_token.tag_)
        cas.add_annotation(prediction)

def generate_cas(self, typesystem: TypeSystem) -> Cas:
    feature_structures = []
    cas = Cas(typesystem)

    for i in range(0, self.size):
        feature_structures.append(self._makeAkof(cas))

    # Randomly link feature structures to each other
    FSArray = cas.typesystem.get_type(TYPE_NAME_FS_ARRAY)
    for fs in feature_structures:
        fs.akofFs = self.rnd.choice(feature_structures)
        fs.akofAFs = FSArray(elements=[
            self.rnd.choice(feature_structures)
            for i in range(0, self.rnd.randint(1, 3))
        ])

    cas.add_all(feature_structures)
    return cas

def extract(self, cas: Cas) -> (str, float):
    studentView = cas.get_view(FeatureExtractor.STUDENT_ANSWER_VIEW)
    answer = next(studentView.select(FeatureExtractor.ANSWER_TYPE))
    score = answer.contentScore

    if score in Outcome.LABEL2INT:
        return ("Outcome", Outcome.LABEL2INT[score])
    else:
        try:
            outcome = float(score)
        except ValueError:
            outcome = float('nan')
        return ("Outcome", outcome)

def cas_to_comparable_text(
    cas: Cas,
    out: Optional[IOBase] = None,
    seeds: Iterable[FeatureStructure] = None,
    mark_indexed: bool = True,
    covered_text: bool = True,
    exclude_types: Set[str] = None,
) -> Optional[str]:
    indexed_feature_structures = _get_indexed_feature_structures(cas)
    all_feature_structures_by_type = _group_feature_structures_by_type(cas._find_all_fs(seeds=seeds))
    types_sorted = sorted(all_feature_structures_by_type.keys())
    fs_id_to_anchor = _generate_anchors(cas, types_sorted, all_feature_structures_by_type,
                                        indexed_feature_structures, mark_indexed=mark_indexed)

    if not out:
        out = StringIO()

    csv_writer = csv.writer(out, dialect=csv.unix_dialect)
    for t in types_sorted:
        if exclude_types and t in exclude_types:
            continue

        type_ = cas.typesystem.get_type(t)
        csv_writer.writerow([type_.name])

        is_annotation_type = covered_text and cas.typesystem.subsumes(
            parent=TYPE_NAME_ANNOTATION, child=type_)
        csv_writer.writerow(_render_header(type_, covered_text=is_annotation_type))

        feature_structures_of_type = all_feature_structures_by_type.get(type_.name)
        if not feature_structures_of_type:
            continue

        for fs in feature_structures_of_type:
            row_data = _render_feature_structure(
                type_, fs, fs_id_to_anchor,
                max_covered_text=30 if is_annotation_type else 0)
            csv_writer.writerow(row_data)

    return out.getvalue() or None

def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
    sentences = cas.select(SENTENCE_TYPE)
    src_tokens = cas.select_covered("webanno.custom.Base", sentences[0])
    trg_tokens = cas.select_covered("webanno.custom.Base", sentences[1])

    src_sentence = [e.get_covered_text() for e in src_tokens]
    trg_sentence = [e.get_covered_text() for e in trg_tokens]

    print(src_sentence)
    print(trg_sentence)

    alignments = self._aligner.get_word_aligns(src_sentence, trg_sentence)

    Relation = cas.typesystem.get_type(layer)
    print(list(Relation.all_features))

    for matching_method in alignments:
        for source_idx, target_idx in alignments[matching_method]:
            src = src_tokens[source_idx]
            target = trg_tokens[target_idx]
            prediction = Relation(
                Governor=src,
                Dependent=target,
                begin=target.begin,
                end=target.end,
                inception_internal_predicted=True,
            )
            # setattr(prediction, feature, f"{src.get_covered_text()} -> {target.get_covered_text()}")
            setattr(prediction, feature, "")
            print(source_idx, target_idx, prediction)
            cas.add_annotation(prediction)

        # Only the first matching method is used
        break

def extract(self, cas: Cas) -> (str, float):
    view = cas.get_view(self.view_name)
    dep_matches = len(list(view.select(TripleOverlap.DEP_MAPPING_TYPE)))

    dep_rels = 0
    for d in view.select(FeatureExtractor.DEPENDENCY_TYPE):
        if d.DependencyType in self.english_arg_rels:
            dep_rels += 1

    ret = -1.0
    if not dep_rels:
        ret = 0.0
    else:
        ret = dep_matches / dep_rels

    return (self.view_name + "-Triple-Overlap", ret)

def test_import_cas(document_collection, requests_mock):
    requests_mock.post(
        f"{API_BASE}/importer/projects/{PROJECT_NAME}/documentCollections/test-collection/documents",
        json={
            "payload": {
                "original_document_name": "text1.xmi",
                "document_name": "text1.xmi"
            },
            "errorMessages": [],
        },
    )

    cas = Cas(typesystem=TypeSystem())
    result = document_collection.import_documents(cas, filename="text1.xmi")

    assert result[0]["document_name"] == "text1.xmi"

def extract(self, cas: Cas) -> (str, float):
    view = cas.get_view(self.view_name)

    matched_ann = 0
    all_ann = 0
    for a in view.select(self.ann_type):
        all_ann += 1
        mappable = self.get_mappable_ann(view, a)
        if mappable.match:
            matched_ann += 1

    ret = -1.0
    if not all_ann:
        ret = 0.0
    else:
        ret = matched_ann / all_ann

    return (self.view_name + "-" + uima.simple_type_name(self.ann_type) + "-Overlap", ret)

def get_perc_of_mapping_type(self, cas: Cas, alignment: AlignmentLabel) -> float:
    overallMatchesCount = 0
    itemsOfGivenTypeCount = 0

    for t in cas.select(FeatureExtractor.TOKEN_TYPE):
        item = self.get_mappable_ann(cas, t)

        # check for matches/alignment
        if item.match is not None and item.match.target is not None:
            overallMatchesCount += 1

            # check for types
            if item.match.label == alignment.name:
                itemsOfGivenTypeCount += 1

    # if nothing has been matched at all, the result is 0
    if overallMatchesCount == 0:
        return 0.0
    else:
        return itemsOfGivenTypeCount / overallMatchesCount

def load_newsgroup_training_data() -> List[TrainingDocument]:
    twenty_train = fetch_20newsgroups(subset="train", categories=NEWSGROUP_CATEGORIES,
                                      shuffle=True, random_state=42)
    target_names = twenty_train.target_names

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)
    PredictedType = typesystem.get_type(PREDICTED_TYPE)

    docs = []
    for i, (text, target) in enumerate(zip(twenty_train.data, twenty_train.target)):
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        begin = 0
        end = len(text)
        cas.add_annotation(SentenceType(begin=begin, end=end))
        cas.add_annotation(PredictedType(begin=begin, end=end, value=target_names[target]))

        doc = TrainingDocument(cas, f"doc_{i}", USER)
        docs.append(doc)

    return docs

def write_sentence_documents(sentences: List[str], labels: List[str], path: Path, labeled=True):
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType, name="value", rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    begin = 0
    for sentence, label in zip(sentences, labels):
        end = begin + len(sentence)

        cas_sentence = SentenceType(begin=begin, end=end)
        sentiment_annotation = SentimentType(begin=begin, end=end, value=label)

        begin = end + 1

        cas.add_annotation(cas_sentence)
        if labeled:
            cas.add_annotation(sentiment_annotation)

    cas.to_xmi(path, pretty_print=True)

    for sentence in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(sentence))
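

# A minimal usage sketch, not part of the original sources: the sentences, labels,
# and output path below are hypothetical example values, assuming the
# write_sentence_documents function above is available in the current module.
if __name__ == "__main__":
    from pathlib import Path

    example_sentences = ["I love this movie.", "The plot was rather dull."]
    example_labels = ["positive", "negative"]
    write_sentence_documents(example_sentences, example_labels,
                             Path("sentences.xmi"), labeled=True)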