    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        # Tokenize the document text with jieba; each result is a (word, begin, end) triple
        for word, begin, end in jieba.tokenize(cas.sofa_string):
            prediction = self.create_prediction(cas, layer, feature, begin, end, word)
            cas.add_annotation(prediction)
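For reference, jieba.tokenize yields (word, start, end) triples over the input string, which is what lets the offsets above go straight into create_prediction. A minimal standalone check:

import jieba

# jieba.tokenize yields (word, start_offset, end_offset) triples
for word, begin, end in jieba.tokenize("永和服装饰品有限公司"):
    print(word, begin, end)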
Example #2
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        stemmer = nltk.PorterStemmer()

        # For every token, stem it and create an annotation in the CAS
        for cas_token in self.iter_tokens(cas):
            stem = stemmer.stem(cas_token.get_covered_text())
            begin = cas_token.begin
            end = begin + len(stem)
            prediction = self.create_prediction(cas, layer, feature, begin, end, stem)
            cas.add_annotation(prediction)
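NLTK's PorterStemmer maps inflected forms to their stems, so the predicted span above (begin + len(stem)) can be shorter than the original token. A quick standalone check:

import nltk

stemmer = nltk.PorterStemmer()
# e.g. "running" -> "run", "stemming" -> "stem"
print(stemmer.stem("running"), stemmer.stem("stemming"))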
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        for i, sentence in enumerate(cas.select(SENTENCE_TYPE)):
            token_ids = self._tokenizer.convert_tokens_to_ids(self._tokenizer.tokenize(sentence.get_covered_text()))
            input_tensor = torch.tensor([token_ids])

            # predict output tensor
            outputs = self._model(input_tensor, adapter_names=[self._adapter_internal_name])

            # retrieve the predicted class label
            label_id = torch.argmax(outputs[0]).item()
            label = self._label_map[label_id]
            prediction = self.create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        model: Optional[Pipeline] = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        for sentence in cas.select(SENTENCE_TYPE):
            predicted = model.predict([sentence.get_covered_text()])[0]
            prediction = create_prediction(cas, layer, feature, sentence.begin,
                                           sentence.end, predicted)
            cas.add_annotation(prediction)
Example #5
def rebuilt2xmi(ci,
                output_dir,
                typesystem_path,
                iiif_mappings,
                pct_coordinates=False) -> str:
    """
    Converts a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file will be named after the content item's ID, adding
    the `.xmi` extension.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: path to the TypeSystem file containing the
        definitions of the annotation layers.
    :type typesystem_path: str
    :param iiif_mappings: IIIF link mappings passed on to `compute_image_links`.
    :param pct_coordinates: whether image coordinates are expressed as
        percentages (forwarded to `compute_image_links`).
    :type pct_coordinates: bool
    :return: the path of the written `.xmi` file.
    :rtype: str
    """

    with open(typesystem_path, "rb") as f:
        typesystem = load_typesystem(f)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    sentType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    imgLinkType = 'webanno.custom.ImpressoImages'
    Sentence = typesystem.get_type(sentType)
    ImageLink = typesystem.get_type(imgLinkType)

    # create sentence-level annotations
    start_offset = 0
    for break_offset in ci.lines:
        start = start_offset
        end = break_offset
        start_offset = break_offset
        cas.add_annotation(Sentence(begin=start, end=end))

    iiif_links = compute_image_links(ci,
                                     iiif_links=iiif_mappings,
                                     pct=pct_coordinates)

    # inject the IIIF links into the CAS as image-link annotations
    for iiif_link, start, end in iiif_links:
        cas.add_annotation(ImageLink(begin=start, end=end, link=iiif_link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path
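A hedged usage sketch for rebuilt2xmi: the content item, IIIF mappings, and paths below are placeholders; the real objects come from the impresso_commons pipeline.

# Hypothetical call; `ci` is an impresso_commons ContentItem and `iiif_mappings`
# an IIIF link mapping, both loaded elsewhere. Paths are illustrative.
xmi_path = rebuilt2xmi(ci,
                       output_dir="out/xmi",
                       typesystem_path="TypeSystem.xml",
                       iiif_mappings=iiif_mappings,
                       pct_coordinates=False)
print(f"wrote {xmi_path}")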
Example #6
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        model = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        featurizer = self._get_featurizer()
        sentences = cas.select(SENTENCE_TYPE)
        featurized_sentences = featurizer.featurize([s.get_covered_text() for s in sentences])
        predictions = model.predict(featurized_sentences)

        for sentence, featurized_sentence, label in zip(sentences, featurized_sentences, predictions):
            prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        for sentence in cas.select(SENTENCE_TYPE):
            cas_tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
            tokens = [t.get_covered_text() for t in cas_tokens]

            grouped_bert_tokens = self._tokenize_bert(tokens)
            predictions = self._predict(grouped_bert_tokens)

            grouped_predictions = self._align_tokens(tokens, grouped_bert_tokens, predictions)

            for token, grouped_prediction in zip(cas_tokens, grouped_predictions):
                begin = token.begin
                end = token.end
                # Majority vote over the word-piece predictions for this token
                label = Counter([self._label_map[pred] for pred in grouped_prediction]).most_common(1)[0][0]
                prediction = self.create_prediction(cas, layer, feature, begin, end, label)
                cas.add_annotation(prediction)
    def _generate_candidates(self, cas: Cas, n: int):
        # We generate token n-grams over the document
        for tokens in mit.windowed(cas.select(TOKEN_TYPE), n):
            # windowed() pads with None if there are fewer than `n` tokens
            if tokens[-1] is None:
                continue
            begin = tokens[0].begin
            end = tokens[-1].end
            text = cas.sofa_string[begin:end]
            yield (begin, end, text)
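The candidate generator above windows over CAS tokens; the same idea with more_itertools on plain strings, purely for illustration:

import more_itertools as mit

tokens = ["a", "small", "example", "sentence"]
# windowed(..., 2) yields overlapping bigrams; note it pads with None
# when the sequence is shorter than the window size.
for ngram in mit.windowed(tokens, 2):
    print(ngram)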
Example #9
    def generate_cas(self, typesystem: TypeSystem) -> Cas:
        cas = Cas(typesystem)
        cas.sofa_string = "x" * 130

        types = [t for t in typesystem.get_types()]
        types.remove(cas.typesystem.get_type(TYPE_NAME_DOCUMENT_ANNOTATION))
        self.rnd.shuffle(types)

        for _ in range(0, self.size):
            for T in types:
                begin = self.rnd.randint(0, 100)
                # end is relative to begin, so each annotation spans at least
                # `minimum_width` characters
                end = begin + self.rnd.randint(0, 30) + self.minimum_width
                fs = T(begin=begin, end=end)
                cas.add(fs)

        return cas
def featurize_cas(fg: FeatureGenerator, cas: Cas) -> List:
    features = get_features()

    results = []

    for qid, entity in enumerate(cas.select("webanno.custom.EntityLinking")):
        candidates = list(
            cas.select_covered("inception.internal.KbHandle", entity))

        if len(candidates) == 0:
            continue

        for i, candidate in enumerate(candidates):
            if entity.iri == candidate.iri:
                gold_idx = i
                break
        else:
            continue

        sentences = list(
            cas.select_covering(
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                entity))
        assert len(sentences) == 1
        sentence = sentences[0]

        mention = entity.get_covered_text().lower()
        context = sentence.get_covered_text().lower()
        # l = len(context)
        # context = context[int(l * 0.25):int(l * 0.75)]

        for cid, candidate in enumerate(candidates):
            score = float(entity.iri == candidate.iri)
            query = candidate.query
            label = candidate.label.lower()

            result = fg.featurize_candidate(qid, cid, "inception_rank", score,
                                            mention, context, label or "",
                                            candidate.description or "",
                                            entity.iri, gold_idx,
                                            candidate.iri, features)

            result.update(fg.featurize_query(mention, query, label))

            results.append(result)

    return results
Example #11
    def extract(self, cas: Cas) -> (str, float):
        studentView = cas.get_view(FeatureExtractor.STUDENT_ANSWER_VIEW)

        variety = 0.0
        for al in AlignmentLabel:
            variety += self.get_perc_of_mapping_type(studentView, al)

        return ("Variety", variety)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        model: Optional[sklearn_crfsuite.CRF] = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        all_tokens = []
        featurized_sentences = []

        for sentence in cas.select(SENTENCE_TYPE):
            tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
            words = [token.get_covered_text() for token in tokens]

            all_tokens.append(tokens)
            featurized_sentences.append(self._sent2features(words))

        all_predictions = model.predict(featurized_sentences)

        assert len(all_predictions) == len(all_tokens)
        for predictions, tokens in zip(all_predictions, all_tokens):
            assert len(predictions) == len(tokens)

            begin = None
            end = None
            prev_tag = "O"
            for tag, token in zip(predictions, tokens):
                if begin is not None and end is not None:
                    if tag == "O" or (tag.startswith("B")
                                      and prev_tag.startswith("I")):
                        prediction = create_prediction(cas, layer, feature,
                                                       begin, end, "X")
                        cas.add_annotation(prediction)

                if tag.startswith("B"):
                    begin = token.begin
                    end = token.end
                elif tag.startswith("I"):
                    end = token.end
                else:
                    begin = None
                    end = None

                prev_tag = tag

            # Flush a span that is still open at the end of the sentence
            if begin is not None and end is not None:
                prediction = create_prediction(cas, layer, feature, begin, end,
                                               "X")
                cas.add_annotation(prediction)
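The decoding above follows BIO conventions. A simplified standalone sketch of the same idea on a toy tag sequence (offsets invented; unlike the code above it closes a span on every new B tag, not only after an I tag):

# Toy BIO span extraction; tags and character offsets are made up.
tags = ["O", "B", "I", "I", "O", "B"]
offsets = [(0, 3), (4, 8), (9, 12), (13, 17), (18, 20), (21, 25)]

spans = []
begin = end = None
for tag, (token_begin, token_end) in zip(tags, offsets):
    if tag == "O" or tag.startswith("B"):
        if begin is not None:
            spans.append((begin, end))
        begin = end = None
    if tag.startswith("B"):
        begin, end = token_begin, token_end
    elif tag.startswith("I"):
        end = token_end
if begin is not None:
    spans.append((begin, end))

print(spans)  # [(4, 17), (21, 25)]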
Example #13
def load_newsgroup_test_data() -> List[Cas]:
    twenty_test = fetch_20newsgroups(subset="test", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42)

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)

    result = []
    for text in twenty_test.data[:5]:
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        begin = 0
        end = len(text)
        cas.add_annotation(SentenceType(begin=begin, end=end))

        result.append(cas)

    return result
Example #14
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        model = self._load_model(user_id)

        if model is None:
            return

        le, items = model

        m = Map.from_iter(items)

        # We iterate over all candidates and check whether they match
        for (begin, end, term) in chain(
            self._generate_candidates(cas, 3), self._generate_candidates(cas, 2), self._generate_candidates(cas, 1)
        ):
            for mention, label_id in m.search(term=term, max_dist=2):
                label = le.inverse_transform([label_id])[0]
                prediction = create_prediction(cas, layer, feature, begin, end, label)
                cas.add_annotation(prediction)
Example #15
    def iter_sentences(self, cas: Cas) -> Iterator[FeatureStructure]:
        """ Returns an iterator over all sentences in the given document.

        Args:
            cas: The CAS containing the document.

        Returns:
            An iterator over all sentence annotations in the CAS.
        """
        return cas.select(SENTENCE_TYPE)
Example #16
    def iter_tokens(self, cas: Cas) -> Iterator[FeatureStructure]:
        """ Returns an iterator over all tokens in the given document.

        Args:
            cas: The CAS containing the document.

        Returns:
            An iterator over all token annotations in the CAS.
        """
        return cas.select(TOKEN_TYPE)
Example #17
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        # Extract the tokens from the CAS and create a spacy doc from it
        words = [
            cas.get_covered_text(cas_token)
            for cas_token in self.iter_tokens(cas)
        ]

        doc = Doc(self._model.vocab, words=words)

        # Run the part-of-speech tagger
        self._model.tagger(doc)

        # For every token, extract the POS tag and create an annotation in the CAS
        for cas_token, spacy_token in zip(self.iter_tokens(cas), doc):
            prediction = self.create_prediction(cas, layer, feature,
                                                cas_token.begin, cas_token.end,
                                                spacy_token.pos_)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        # Extract the tokens from the CAS and create a spacy doc from it
        cas_tokens = cas.select(TOKEN_TYPE)
        words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]

        doc = Doc(self._model.vocab, words=words)

        # Find the named entities
        self._model.get_pipe("ner")(doc)

        # For every entity returned by spacy, create an annotation in the CAS
        for named_entity in doc.ents:
            begin = cas_tokens[named_entity.start].begin
            end = cas_tokens[named_entity.end - 1].end
            label = named_entity.label_
            prediction = create_prediction(cas, layer, feature, begin, end,
                                           label)
            cas.add_annotation(prediction)
def export_annotated_texts_to_xmi(annotated_texts: List[AnnotatedText],
                                  type_system,
                                  file: str,
                                  xmi_file=None):

    cas = Cas(typesystem=type_system)

    current_start = 0
    starts = []
    sofa_string = ''

    # Create sofa string
    for annotated_text in annotated_texts:
        starts.append(current_start)
        text = annotated_text.text
        if not text.endswith('\n'):
            text += '\n'
        sofa_string += text
        current_start += len(text)

    cas.sofa_string = sofa_string

    # Tokens
    for annotated_text, start in zip(annotated_texts, starts):
        for token in annotated_text.tokens:
            annotation = cas.typesystem.get_type(TOKEN_NS)(
                begin=start + token.start, end=start + token.stop)
            cas.add_annotation(annotation)

    # Sentences
    for annotated_text, start in zip(annotated_texts, starts):
        annotation = cas.typesystem.get_type(SENTENCE_NS)(
            begin=start, end=start + len(annotated_text.text))
        cas.add_annotation(annotation)

    # Annotations
    for annotated_text, start in zip(annotated_texts, starts):
        for annotation in annotated_text.annotations:
            annotation = cas.typesystem.get_type(NAMED_ENTITY_NS)(
                value=annotation.label,
                begin=start + annotation.start,
                end=start + annotation.stop)
            cas.add_annotation(annotation)

    # write
    with open(file, 'wb') as f:
        dump_cas_to_zip_file(cas, f, xmi_file=xmi_file)
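A hedged usage sketch for export_annotated_texts_to_xmi; the real AnnotatedText, token, and annotation classes live elsewhere in this project, so the namedtuples below are hypothetical stand-ins that only mimic the attributes the exporter reads (text, tokens with start/stop, annotations with label/start/stop):

from collections import namedtuple

# Hypothetical stand-ins; attribute names inferred from how the exporter reads them.
Token = namedtuple("Token", ["start", "stop"])
Annotation = namedtuple("Annotation", ["label", "start", "stop"])
Doc = namedtuple("Doc", ["text", "tokens", "annotations"])

doc = Doc(
    text="Alice visited Berlin",
    tokens=[Token(0, 5), Token(6, 13), Token(14, 20)],
    annotations=[Annotation("PER", 0, 5), Annotation("LOC", 14, 20)],
)

# type_system would be loaded with cassis.load_typesystem elsewhere.
export_annotated_texts_to_xmi([doc], type_system, file="out.zip")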
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        # Extract the tokens from the CAS and create a spacy doc from it
        words = [
            cas.get_covered_text(cas_token)
            for cas_token in cas.select(TOKEN_TYPE)
        ]

        doc = Doc(self._model.vocab, words=words)

        # Get the pos tags
        self._model.get_pipe("tok2vec")(doc)
        self._model.get_pipe("tagger")(doc)

        # For every token, extract the POS tag and create an annotation in the CAS
        for cas_token, spacy_token in zip(cas.select(TOKEN_TYPE), doc):
            prediction = create_prediction(cas, layer, feature,
                                           cas_token.begin, cas_token.end,
                                           spacy_token.tag_)
            cas.add_annotation(prediction)
Example #21
    def generate_cas(self, typesystem: TypeSystem) -> Cas:
        feature_structures = []

        cas = Cas(typesystem)

        for i in range(0, self.size):
            feature_structures.append(self._makeAkof(cas))

        # Randomly link feature structures to each other
        FSArray = cas.typesystem.get_type(TYPE_NAME_FS_ARRAY)
        for fs in feature_structures:
            fs.akofFs = self.rnd.choice(feature_structures)
            fs.akofAFs = FSArray(elements=[
                self.rnd.choice(feature_structures)
                for i in range(0, self.rnd.randint(1, 3))
            ])

        cas.add_all(feature_structures)

        return cas
Example #22
    def extract(self, cas: Cas) -> (str, float):
        studentView = cas.get_view(FeatureExtractor.STUDENT_ANSWER_VIEW)
        answer = next(studentView.select(FeatureExtractor.ANSWER_TYPE))
        score = answer.contentScore

        if score in Outcome.LABEL2INT:
            return ("Outcome", Outcome.LABEL2INT[score])
        else:
            try:
                outcome = float(score)
            except ValueError:
                outcome = float('nan')
            return ("Outcome", outcome)
Example #23
def cas_to_comparable_text(
    cas: Cas,
    out: Optional[IOBase] = None,
    seeds: Iterable[FeatureStructure] = None,
    mark_indexed: bool = True,
    covered_text: bool = True,
    exclude_types: Set[str] = None,
) -> Optional[str]:
    indexed_feature_structures = _get_indexed_feature_structures(cas)
    all_feature_structures_by_type = _group_feature_structures_by_type(
        cas._find_all_fs(seeds=seeds))
    types_sorted = sorted(all_feature_structures_by_type.keys())
    fs_id_to_anchor = _generate_anchors(cas,
                                        types_sorted,
                                        all_feature_structures_by_type,
                                        indexed_feature_structures,
                                        mark_indexed=mark_indexed)

    if not out:
        out = StringIO()

    csv_writer = csv.writer(out, dialect=csv.unix_dialect)
    for t in types_sorted:
        if exclude_types and t in exclude_types:
            continue

        type_ = cas.typesystem.get_type(t)

        csv_writer.writerow([type_.name])

        is_annotation_type = covered_text and cas.typesystem.subsumes(
            parent=TYPE_NAME_ANNOTATION, child=type_)
        csv_writer.writerow(
            _render_header(type_, covered_text=is_annotation_type))

        feature_structures_of_type = all_feature_structures_by_type.get(
            type_.name)

        if not feature_structures_of_type:
            continue

        for fs in feature_structures_of_type:
            row_data = _render_feature_structure(
                type_,
                fs,
                fs_id_to_anchor,
                max_covered_text=30 if is_annotation_type else 0)
            csv_writer.writerow(row_data)

    return out.getvalue() or None
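A minimal usage sketch, assuming a CAS loaded with cassis (file names are placeholders); the function renders each type as a CSV block, which makes two CASes easy to diff.

from cassis import load_typesystem, load_cas_from_xmi

# Placeholder file names; load a CAS and print its comparable-text rendering.
with open("TypeSystem.xml", "rb") as f:
    ts = load_typesystem(f)
with open("document.xmi", "rb") as f:
    cas = load_cas_from_xmi(f, typesystem=ts)

print(cas_to_comparable_text(cas))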
Example #24
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):

        sentences = cas.select(SENTENCE_TYPE)

        src_tokens = cas.select_covered("webanno.custom.Base", sentences[0])
        trg_tokens = cas.select_covered("webanno.custom.Base", sentences[1])

        src_sentence = [e.get_covered_text() for e in src_tokens]
        trg_sentence = [e.get_covered_text() for e in trg_tokens]

        print(src_sentence)
        print(trg_sentence)

        alignments = self._aligner.get_word_aligns(src_sentence, trg_sentence)

        Relation = cas.typesystem.get_type(layer)
        print(list(Relation.all_features))

        for matching_method in alignments:
            for source_idx, target_idx in alignments[matching_method]:
                src = src_tokens[source_idx]
                target = trg_tokens[target_idx]
                prediction = Relation(
                    Governor=src,
                    Dependent=target,
                    begin=target.begin,
                    end=target.end,
                    inception_internal_predicted=True,
                )
                # setattr(prediction, feature, f"{src.get_covered_text()} -> {target.get_covered_text()}")
                setattr(prediction, feature, "")
                print(source_idx, target_idx, prediction)

                cas.add_annotation(prediction)
            break
Example #25
    def extract(self, cas: Cas) -> (str, float):
        view = cas.get_view(self.view_name)
        dep_matches = len(list(view.select(TripleOverlap.DEP_MAPPING_TYPE)))
        dep_rels = 0

        for d in view.select(FeatureExtractor.DEPENDENCY_TYPE):
            if d.DependencyType in self.english_arg_rels:
                dep_rels += 1

        ret = -1.0
        if not dep_rels:
            ret = 0.0
        else:
            ret = dep_matches / dep_rels

        return (self.view_name + "-Triple-Overlap", ret)
Example #26
def test_import_cas(document_collection, requests_mock):
    requests_mock.post(
        f"{API_BASE}/importer/projects/{PROJECT_NAME}/documentCollections/test-collection/documents",
        json={
            "payload": {
                "original_document_name": "text1.xmi",
                "document_name": "text1.xmi"
            },
            "errorMessages": [],
        },
    )

    cas = Cas(typesystem=TypeSystem())

    result = document_collection.import_documents(cas, filename="text1.xmi")

    assert result[0]["document_name"] == "text1.xmi"
Example #27
    def extract(self, cas: Cas) -> (str, float):
        view = cas.get_view(self.view_name)
        matched_ann = 0
        all_ann = 0
        for a in view.select(self.ann_type):
            all_ann += 1
            mappable = self.get_mappable_ann(view, a)

            if mappable.match:
                matched_ann += 1

        ret = -1.0
        if not all_ann:
            ret = 0.0
        else:
            ret = matched_ann / all_ann

        return (self.view_name + "-" + uima.simple_type_name(self.ann_type) +
                "-Overlap", ret)
Example #28
    def get_perc_of_mapping_type(self, cas: Cas,
                                 alignment: AlignmentLabel) -> float:
        overallMatchesCount = 0
        itemsOfGivenTypeCount = 0
        for t in cas.select(FeatureExtractor.TOKEN_TYPE):

            item = self.get_mappable_ann(cas, t)

            # check for matches/alignment
            if item.match is not None and item.match.target is not None:
                overallMatchesCount += 1
                # check for types
                if item.match.label == alignment.name:
                    itemsOfGivenTypeCount += 1

        # if nothing has been matched at all, the result is 0
        if overallMatchesCount == 0:
            return 0.0
        else:
            return itemsOfGivenTypeCount / overallMatchesCount
Example #29
def load_newsgroup_training_data() -> List[TrainingDocument]:
    twenty_train = fetch_20newsgroups(subset="train", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42)
    target_names = twenty_train.target_names

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)
    PredictedType = typesystem.get_type(PREDICTED_TYPE)

    docs = []
    for i, (text, target) in enumerate(zip(twenty_train.data, twenty_train.target)):
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        begin = 0
        end = len(text)
        cas.add_annotation(SentenceType(begin=begin, end=end))
        cas.add_annotation(PredictedType(begin=begin, end=end, value=target_names[target]))

        doc = TrainingDocument(cas, f"doc_{i}", USER)
        docs.append(doc)

    return docs
Example #30
def write_sentence_documents(sentences: List[str],
                             labels: List[str],
                             path: Path,
                             labeled=True):
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType,
                           name="value",
                           rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    begin = 0
    for sentence, label in zip(sentences, labels):
        end = begin + len(sentence)
        cas_sentence = SentenceType(begin=begin, end=end)
        sentiment_annotation = SentimentType(begin=begin, end=end, value=label)
        begin = end + 1

        cas.add_annotation(cas_sentence)

        if labeled:
            cas.add_annotation(sentiment_annotation)

    cas.to_xmi(path, pretty_print=True)

    for sentence in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(sentence))
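A usage sketch for write_sentence_documents; sentences, labels, and the output path are illustrative.

from pathlib import Path

# Two labelled sentences written to a single XMI document.
write_sentence_documents(
    sentences=["I really liked this movie.", "The plot made no sense."],
    labels=["positive", "negative"],
    path=Path("sentences.xmi"),
    labeled=True,
)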