def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
             document_id: str, user_id: str):
     result = jieba.tokenize(cas.sofa_string)
     for tk in result:
         prediction = self.create_prediction(cas, layer, feature, tk[1],
                                             tk[2], tk[0])
         cas.add_annotation(prediction)
Ejemplo n.º 2
0
def write_sentence_documents(sentences: List[str],
                             labels: List[str],
                             path: Path,
                             labeled=True):
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType,
                           name="value",
                           rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    begin = 0
    for sentence, label in zip(sentences, labels):
        end = begin + len(sentence)
        cas_sentence = SentenceType(begin=begin, end=end)
        sentiment_annotation = SentimentType(begin=begin, end=end, value=label)
        begin = end + 1

        cas.add_annotation(cas_sentence)

        if labeled:
            cas.add_annotation(sentiment_annotation)

    cas.to_xmi(path, pretty_print=True)

    for sentence in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(sentence))
Ejemplo n.º 3
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        stemmer = nltk.PorterStemmer()

        # For every token, steam it and create an annotation in the CAS
        for cas_token in self.iter_tokens(cas):
            stem = stemmer.stem(cas_token.get_covered_text())
            begin = cas_token.begin
            end = begin + len(stem)
            prediction = self.create_prediction(cas, layer, feature, begin, end, stem)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        for i, sentence in enumerate(cas.select(SENTENCE_TYPE)):
            token_ids = self._tokenizer.convert_tokens_to_ids(self._tokenizer.tokenize(sentence.get_covered_text()))
            input_tensor = torch.tensor([token_ids])

            # predict output tensor
            outputs = self._model(input_tensor, adapter_names=[self._adapter_internal_name])

            # retrieve the predicted class label
            label_id = torch.argmax(outputs[0]).item()
            label = self._label_map[label_id]
            prediction = self.create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        model: Optional[Pipeline] = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        for sentence in cas.select(SENTENCE_TYPE):
            predicted = model.predict([sentence.get_covered_text()])[0]
            prediction = create_prediction(cas, layer, feature, sentence.begin,
                                           sentence.end, predicted)
            cas.add_annotation(prediction)
Ejemplo n.º 6
0
def rebuilt2xmi(ci,
                output_dir,
                typesystem_path,
                iiif_mappings,
                pct_coordinates=False) -> str:
    """
    Converts a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file will be named after the content item's ID, adding
    the `.xmi` extension.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing defitions of annotation
    layers.
    :type typesystem_path: str
    """

    with open(typesystem_path, "rb") as f:
        typesystem = load_typesystem(f)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    sentType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    imgLinkType = 'webanno.custom.ImpressoImages'
    Sentence = typesystem.get_type(sentType)
    ImageLink = typesystem.get_type(imgLinkType)

    # create sentence-level annotations
    start_offset = 0
    for break_offset in ci.lines:
        start = start_offset
        end = break_offset
        start_offset = break_offset
        cas.add_annotation(Sentence(begin=start, end=end))

    iiif_links = compute_image_links(ci,
                                     iiif_links=iiif_mappings,
                                     pct=pct_coordinates)

    # inject the IIIF links into
    for iiif_link, start, end in iiif_links:
        cas.add_annotation(ImageLink(begin=start, end=end, link=iiif_link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path
Ejemplo n.º 7
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        model = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        featurizer = self._get_featurizer()
        sentences = cas.select(SENTENCE_TYPE)
        featurized_sentences = featurizer.featurize([s.get_covered_text() for s in sentences])
        predictions = model.predict(featurized_sentences)

        for sentence, featurized_sentence, label in zip(sentences, featurized_sentences, predictions):
            prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        for sentence in cas.select(SENTENCE_TYPE):
            cas_tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
            tokens = [t.get_covered_text() for t in cas_tokens]

            grouped_bert_tokens = self._tokenize_bert(tokens)
            predictions = self._predict(grouped_bert_tokens)

            grouped_predictions = self._align_tokens(tokens, grouped_bert_tokens, predictions)

            for token, grouped_prediction in zip(cas_tokens, grouped_predictions):
                begin = token.begin
                end = token.end
                label = Counter([self._label_map[pred] for pred in grouped_prediction]).most_common(1)[0][0]
                prediction = self.create_prediction(cas, layer, feature, begin, end, label)
                cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        model: Optional[sklearn_crfsuite.CRF] = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        all_tokens = []
        featurized_sentences = []

        for sentence in cas.select(SENTENCE_TYPE):
            tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
            words = [token.get_covered_text() for token in tokens]

            all_tokens.append(tokens)
            featurized_sentences.append(self._sent2features(words))

        all_predictions = model.predict(featurized_sentences)

        assert len(all_predictions) == len(all_tokens)
        for predictions, tokens in zip(all_predictions, all_tokens):
            assert len(predictions) == len(tokens)

            begin = None
            end = None
            prev_tag = "O"
            for tag, token in zip(predictions, tokens):
                if begin is not None and end is not None:
                    if tag == "O" or (tag.startswith("B")
                                      and prev_tag.startswith("I")):
                        prediction = create_prediction(cas, layer, feature,
                                                       begin, end, "X")
                        cas.add_annotation(prediction)

                if tag.startswith("B"):
                    begin = token.begin
                    end = token.end
                elif tag.startswith("I"):
                    end = token.end
                else:
                    begin = None
                    end = None

                prev_tag = tag
Ejemplo n.º 10
0
def load_newsgroup_test_data() -> List[Cas]:
    twenty_test = fetch_20newsgroups(subset="test", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42)

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)

    result = []
    for text in twenty_test.data[:5]:
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        begin = 0
        end = len(text)
        cas.add_annotation(SentenceType(begin=begin, end=end))

        result.append(cas)

    return result
Ejemplo n.º 11
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        model = self._load_model(user_id)

        if model is None:
            return

        le, items = model

        m = Map.from_iter(items)

        # We iterate over the all candidates and check whether they match
        for (begin, end, term) in chain(
            self._generate_candidates(cas, 3), self._generate_candidates(cas, 2), self._generate_candidates(cas, 1)
        ):
            for mention, label_id in m.search(term=term, max_dist=2):
                label = le.inverse_transform([label_id])[0]
                prediction = create_prediction(cas, layer, feature, begin, end, label)
                cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        # Extract the tokens from the CAS and create a spacy doc from it
        cas_tokens = cas.select(TOKEN_TYPE)
        words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]

        doc = Doc(self._model.vocab, words=words)

        # Find the named entities
        self._model.get_pipe("ner")(doc)

        # For every entity returned by spacy, create an annotation in the CAS
        for named_entity in doc.ents:
            begin = cas_tokens[named_entity.start].begin
            end = cas_tokens[named_entity.end - 1].end
            label = named_entity.label_
            prediction = create_prediction(cas, layer, feature, begin, end,
                                           label)
            cas.add_annotation(prediction)
Ejemplo n.º 13
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        # Extract the tokens from the CAS and create a spacy doc from it
        words = [
            cas.get_covered_text(cas_token)
            for cas_token in self.iter_tokens(cas)
        ]

        doc = Doc(self._model.vocab, words=words)

        # Find the named entities
        self._model.tagger(doc)

        # For every token, extract the POS tag and create an annotation in the CAS
        for cas_token, spacy_token in zip(self.iter_tokens(cas), doc):
            prediction = self.create_prediction(cas, layer, feature,
                                                cas_token.begin, cas_token.end,
                                                spacy_token.pos_)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        # Extract the tokens from the CAS and create a spacy doc from it
        words = [
            cas.get_covered_text(cas_token)
            for cas_token in cas.select(TOKEN_TYPE)
        ]

        doc = Doc(self._model.vocab, words=words)

        # Get the pos tags
        self._model.get_pipe("tok2vec")(doc)
        self._model.get_pipe("tagger")(doc)

        # For every token, extract the POS tag and create an annotation in the CAS
        for cas_token, spacy_token in zip(cas.select(TOKEN_TYPE), doc):
            prediction = create_prediction(cas, layer, feature,
                                           cas_token.begin, cas_token.end,
                                           spacy_token.tag_)
            cas.add_annotation(prediction)
Ejemplo n.º 15
0
def load_newsgroup_training_data() -> List[TrainingDocument]:
    twenty_train = fetch_20newsgroups(subset="train", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42)
    target_names = twenty_train.target_names

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)
    PredictedType = typesystem.get_type(PREDICTED_TYPE)

    docs = []
    for i, (text, target) in enumerate(zip(twenty_train.data, twenty_train.target)):
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        begin = 0
        end = len(text)
        cas.add_annotation(SentenceType(begin=begin, end=end))
        cas.add_annotation(PredictedType(begin=begin, end=end, value=target_names[target]))

        doc = TrainingDocument(cas, f"doc_{i}", USER)
        docs.append(doc)

    return docs
Ejemplo n.º 16
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):

        sentences = cas.select(SENTENCE_TYPE)

        src_tokens = cas.select_covered("webanno.custom.Base", sentences[0])
        trg_tokens = cas.select_covered("webanno.custom.Base", sentences[1])

        src_sentence = [e.get_covered_text() for e in src_tokens]
        trg_sentence = [e.get_covered_text() for e in trg_tokens]

        print(src_sentence)
        print(trg_sentence)

        alignments = self._aligner.get_word_aligns(src_sentence, trg_sentence)

        Relation = cas.typesystem.get_type(layer)
        print(list(Relation.all_features))

        for matching_method in alignments:
            for source_idx, target_idx in alignments[matching_method]:
                src = src_tokens[source_idx]
                target = trg_tokens[target_idx]
                prediction = Relation(
                    Governor=src,
                    Dependent=target,
                    begin=target.begin,
                    end=target.end,
                    inception_internal_predicted=True,
                )
                # setattr(prediction, feature, f"{src.get_covered_text()} -> {target.get_covered_text()}")
                setattr(prediction, feature, "")
                print(source_idx, target_idx, prediction)

                cas.add_annotation(prediction)
            break
def convert_single_file(input_paragraph_list: List[str],
                        output_xmi_file: str) -> None:
    document_text = '\n'.join(input_paragraph_list)

    cas = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    cas.sofa_string = document_text

    print("----")
    print(document_text)
    print("----")

    token_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    total_doc_offset: int = 0
    for paragraph_str in input_paragraph_list:
        this_paragraph_total_offset = total_doc_offset

        doc: Doc = nlp(paragraph_str)

        for token in doc:
            assert isinstance(token, Token)
            # print(token.text, token.idx, len(token), token.idx + len(token), token.is_space)
            begin: int = total_doc_offset + token.idx
            end: int = total_doc_offset + token.idx + len(token)
            # annotate token -- only if it is not a space!
            if not token.is_space:
                cas.add_annotation(token_type.__call__(begin=begin, end=end))

        total_doc_offset += len(paragraph_str)

        # annotate paragraph
        this_paragraph_annotation = paragraph_type.__call__(
            begin=this_paragraph_total_offset, end=total_doc_offset)
        cas.add_annotation(this_paragraph_annotation)
        # and for paragraph too; but how about the '\n' char? maybe +1?
        total_doc_offset += 1

        # add sentences aligned exactly to paragraphs
        cas.add_annotation(
            sentence_type.__call__(begin=this_paragraph_annotation.begin,
                                   end=this_paragraph_annotation.end))

    print([x.get_covered_text() for x in cas.select(paragraph_type.name)])
    print([x.get_covered_text() for x in cas.select(sentence_type.name)])
    print([x.get_covered_text() for x in cas.select(token_type.name)])

    # create parent folder if not exists
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)

    cas.to_xmi(output_xmi_file)
def export_annotated_texts_to_xmi(annotated_texts: List[AnnotatedText],
                                  type_system,
                                  file: str,
                                  xmi_file=None):

    cas = Cas(typesystem=type_system)

    current_start = 0
    starts = []
    sofa_string = ''

    # Create sofa string
    for annotated_text in annotated_texts:
        starts.append(current_start)
        text = annotated_text.text
        if not text.endswith('\n'):
            text += '\n'
        sofa_string += text
        current_start += len(text)

    cas.sofa_string = sofa_string

    # Tokens
    for annotated_text, start in zip(annotated_texts, starts):
        for token in annotated_text.tokens:
            annotation = cas.typesystem.get_type(TOKEN_NS)(
                begin=start + token.start, end=start + token.stop)
            cas.add_annotation(annotation)

    # Sentences
    for annotated_text, start in zip(annotated_texts, starts):
        annotation = cas.typesystem.get_type(SENTENCE_NS)(
            begin=start, end=start + len(annotated_text.text))
        cas.add_annotation(annotation)

    # Annotations
    for annotated_text, start in zip(annotated_texts, starts):
        for annotation in annotated_text.annotations:
            annotation = cas.typesystem.get_type(NAMED_ENTITY_NS)(
                value=annotation.label,
                begin=start + annotation.start,
                end=start + annotation.stop)
            cas.add_annotation(annotation)

    # write
    with open(file, 'wb') as f:
        dump_cas_to_zip_file(cas, f, xmi_file=xmi_file)