def convert_single_file(input_paragraph_list: List[str],
                        output_xmi_file: str) -> None:
    """Write paragraphs as one XMI document with segmentation annotations.

    The paragraphs are joined with a single newline to form the sofa string.
    Each paragraph is run through the module-level ``nlp`` pipeline; every
    token that is not whitespace becomes a DKPro ``Token`` annotation. Each
    paragraph is annotated as a ``Paragraph`` and, with identical offsets,
    as a ``Sentence``. The document text and the covered text of all
    annotations are printed to stdout.

    Args:
        input_paragraph_list: Paragraph texts in document order.
        output_xmi_file: Path of the XMI file to write; missing parent
            directories are created.
    """
    document_text = '\n'.join(input_paragraph_list)

    cas = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    cas.sofa_string = document_text

    print("----")
    print(document_text)
    print("----")

    token_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    # Offset of the current paragraph's first character in the joined text.
    paragraph_begin: int = 0
    for paragraph_str in input_paragraph_list:
        doc: Doc = nlp(paragraph_str)

        for token in doc:
            assert isinstance(token, Token)
            # Whitespace tokens are not annotated.
            if token.is_space:
                continue
            # token.idx is relative to the paragraph; shift it to the document.
            token_begin: int = paragraph_begin + token.idx
            cas.add_annotation(
                token_type(begin=token_begin, end=token_begin + len(token)))

        paragraph_end: int = paragraph_begin + len(paragraph_str)

        # The paragraph and its sentence share exactly the same span.
        cas.add_annotation(
            paragraph_type(begin=paragraph_begin, end=paragraph_end))
        cas.add_annotation(
            sentence_type(begin=paragraph_begin, end=paragraph_end))

        # Skip the '\n' that separates paragraphs in the joined text.
        paragraph_begin = paragraph_end + 1

    for annotation_type in (paragraph_type, sentence_type, token_type):
        print([x.get_covered_text() for x in cas.select(annotation_type.name)])

    # Make sure the target directory exists before serialising.
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)

    cas.to_xmi(output_xmi_file)
def write_sentence_documents(sentences: List[str],
                             labels: List[str],
                             path: Path,
                             labeled=True):
    """Serialise sentences (and optionally their labels) to an XMI file.

    A fresh type system with a ``Sentence`` type and a
    ``webanno.custom.Sentiment`` type (string feature ``value``) is created.
    The sofa string is the sentences joined by single spaces. Sentences and
    labels are paired with ``zip``, so items beyond the shorter list are
    ignored. After writing, the covered text of every ``SENTENCE_TYPE``
    annotation is printed.

    Args:
        sentences: Sentence texts in document order.
        labels: One label per sentence, stored in the ``value`` feature.
        path: Destination of the pretty-printed XMI file.
        labeled: When true, a sentiment annotation is added for each
            sentence in addition to the sentence annotation.
    """
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    sentence_cls = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    sentiment_cls = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=sentiment_cls,
                           name="value",
                           rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    cursor = 0
    for text, sentiment in zip(sentences, labels):
        stop = cursor + len(text)
        sentence_annotation = sentence_cls(begin=cursor, end=stop)
        sentiment_annotation = sentiment_cls(begin=cursor,
                                             end=stop,
                                             value=sentiment)

        cas.add_annotation(sentence_annotation)
        if labeled:
            cas.add_annotation(sentiment_annotation)

        # +1 accounts for the space that joins consecutive sentences.
        cursor = stop + 1

    cas.to_xmi(path, pretty_print=True)

    for annotated_sentence in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(annotated_sentence))
 def _generate_candidates(self, cas: Cas, n: int):
     """Yield ``(begin, end, text)`` for every token n-gram in the document.

     Args:
         cas: The CAS whose ``TOKEN_TYPE`` annotations are windowed.
         n: Number of consecutive tokens per candidate.

     Yields:
         Tuples of the first token's begin, the last token's end and the
         sofa-string slice between them.
     """
     # We generate token n-grams
     for tokens in mit.windowed(cas.select(TOKEN_TYPE), n):
         # NOTE(review): ``mit`` is presumably ``more_itertools``; its
         # ``windowed`` pads a window at the end with ``None`` when fewer
         # than ``n`` items are available. Such a window is not a real
         # n-gram and would crash on ``.end`` below, so skip it.
         if tokens[-1] is None:
             continue

         begin = tokens[0].begin
         end = tokens[-1].end
         text = cas.sofa_string[begin:end]
         yield (begin, end, text)
Example #4
0
    def iter_tokens(self, cas: Cas) -> Iterator[FeatureStructure]:
        """Select all ``TOKEN_TYPE`` annotations of the given document.

        Args:
            cas: The CAS to read the token annotations from.

        Returns:
            The result of ``cas.select(TOKEN_TYPE)``.
        """
        tokens = cas.select(TOKEN_TYPE)
        return tokens
Example #5
0
    def iter_sentences(self, cas: Cas) -> Iterator[FeatureStructure]:
        """Select all ``SENTENCE_TYPE`` annotations of the given document.

        Args:
            cas: The CAS to read the sentence annotations from.

        Returns:
            The result of ``cas.select(SENTENCE_TYPE)``.
        """
        sentences = cas.select(SENTENCE_TYPE)
        return sentences
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        """Tag every token with the spaCy tagger and add one prediction each.

        The existing ``TOKEN_TYPE`` annotations define the tokenisation: a
        spaCy ``Doc`` is built from their covered text, the ``tok2vec`` and
        ``tagger`` pipes are applied, and each token's ``tag_`` is stored as
        a prediction spanning that token.

        Args:
            cas: The CAS to read tokens from and add predictions to.
            layer: Passed through to ``create_prediction``.
            feature: Passed through to ``create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Not used by this method.
        """
        # Select once and materialise: the same token sequence is needed for
        # building the Doc and for aligning the tags, and the CAS must not be
        # iterated lazily while annotations are being added to it.
        cas_tokens = list(cas.select(TOKEN_TYPE))
        words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]

        doc = Doc(self._model.vocab, words=words)

        # Get the pos tags
        self._model.get_pipe("tok2vec")(doc)
        self._model.get_pipe("tagger")(doc)

        # For every token, extract the POS tag and create an annotation in the CAS
        for cas_token, spacy_token in zip(cas_tokens, doc):
            prediction = create_prediction(cas, layer, feature,
                                           cas_token.begin, cas_token.end,
                                           spacy_token.tag_)
            cas.add_annotation(prediction)
Example #7
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        """Add a Porter-stem prediction for every token in the CAS.

        Each prediction starts at the token's begin offset and is as long as
        the computed stem, which is also used as the prediction's label.

        Args:
            cas: The CAS to read tokens from and add predictions to.
            layer: Passed through to ``create_prediction``.
            feature: Passed through to ``create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Not used by this method.
        """
        porter = nltk.PorterStemmer()

        # For every token, stem it and create an annotation in the CAS
        for token in cas.select(TOKEN_TYPE):
            stemmed = porter.stem(token.get_covered_text())
            start = token.begin
            stop = start + len(stemmed)
            cas.add_annotation(
                create_prediction(cas, layer, feature, start, stop, stemmed))
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        """Classify every sentence with the adapter model and add a prediction.

        Each sentence's covered text is tokenised and converted to ids with
        ``self._tokenizer``, fed to ``self._model`` with the configured
        adapter, and the arg-max of the first output is mapped to a label via
        ``self._label_map``.

        Args:
            cas: The CAS to read sentences from and add predictions to.
            layer: Passed through to ``self.create_prediction``.
            feature: Passed through to ``self.create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Not used by this method.
        """
        for sentence in cas.select(SENTENCE_TYPE):
            token_ids = self._tokenizer.convert_tokens_to_ids(self._tokenizer.tokenize(sentence.get_covered_text()))
            input_tensor = torch.tensor([token_ids])

            # predict output tensor; this is pure inference, so no autograd
            # graph needs to be recorded
            with torch.no_grad():
                outputs = self._model(input_tensor, adapter_names=[self._adapter_internal_name])

            # retrieve the predicted class label
            label_id = torch.argmax(outputs[0]).item()
            label = self._label_map[label_id]
            prediction = self.create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        """Label every sentence with the model loaded for ``user_id``.

        If ``self._load_model`` returns ``None`` a debug message is logged
        and nothing is predicted. Otherwise the model is called once per
        sentence and its first result becomes the label of a prediction
        spanning that sentence.

        Args:
            cas: The CAS to read sentences from and add predictions to.
            layer: Passed through to ``create_prediction``.
            feature: Passed through to ``create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Identifies which model to load.
        """
        model: Optional[Pipeline] = self._load_model(user_id)
        if model is None:
            logger.debug("No trained model ready yet!")
            return

        for sentence in cas.select(SENTENCE_TYPE):
            text = sentence.get_covered_text()
            label = model.predict([text])[0]
            cas.add_annotation(
                create_prediction(cas, layer, feature, sentence.begin,
                                  sentence.end, label))
Example #10
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        """Featurize all sentences, classify them in one batch and add predictions.

        If ``self._load_model`` returns ``None`` a debug message is logged
        and nothing is predicted.

        Args:
            cas: The CAS to read sentences from and add predictions to.
            layer: Passed through to ``create_prediction``.
            feature: Passed through to ``create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Identifies which model to load.
        """
        model = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        featurizer = self._get_featurizer()

        # Materialise the selection: the sentences are traversed twice (once
        # for featurizing, once for pairing with the predictions), which would
        # silently produce no predictions if ``select`` returned a one-shot
        # iterator.
        sentences = list(cas.select(SENTENCE_TYPE))
        featurized_sentences = featurizer.featurize([s.get_covered_text() for s in sentences])
        predictions = model.predict(featurized_sentences)

        for sentence, label in zip(sentences, predictions):
            prediction = create_prediction(cas, layer, feature, sentence.begin, sentence.end, label)
            cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str, document_id: str, user_id: str):
        """Predict one label per token, sentence by sentence.

        For each sentence the covered tokens are re-tokenised with
        ``self._tokenize_bert``, predicted with ``self._predict`` and mapped
        back to the original tokens with ``self._align_tokens``. The label of
        a token is the most common mapped label among its grouped predictions.

        Args:
            cas: The CAS to read sentences/tokens from and add predictions to.
            layer: Passed through to ``self.create_prediction``.
            feature: Passed through to ``self.create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Not used by this method.
        """
        for sentence in cas.select(SENTENCE_TYPE):
            sentence_tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
            words = [tok.get_covered_text() for tok in sentence_tokens]

            bert_groups = self._tokenize_bert(words)
            raw_predictions = self._predict(bert_groups)
            per_token_predictions = self._align_tokens(words, bert_groups, raw_predictions)

            for tok, token_predictions in zip(sentence_tokens, per_token_predictions):
                # Majority vote over the predictions grouped under this token.
                votes = Counter([self._label_map[pred] for pred in token_predictions])
                label, _ = votes.most_common(1)[0]
                cas.add_annotation(
                    self.create_prediction(cas, layer, feature, tok.begin, tok.end, label))
def featurize_cas(fg: FeatureGenerator, cas: Cas) -> List:
    """Build ranking features for every linked entity and its candidates.

    For each ``webanno.custom.EntityLinking`` annotation the covered
    ``inception.internal.KbHandle`` candidates are collected. Entities without
    candidates, or whose ``iri`` matches none of the candidates, are skipped.
    For the remaining ones, every candidate is featurized with
    ``fg.featurize_candidate`` and ``fg.featurize_query``.

    Args:
        fg: Feature generator providing ``featurize_candidate`` and
            ``featurize_query``.
        cas: The CAS holding entity, candidate and sentence annotations.

    Returns:
        One feature mapping per (entity, candidate) pair, in traversal order.

    Raises:
        AssertionError: If an entity is not covered by exactly one sentence.
    """
    features = get_features()

    results = []

    for qid, entity in enumerate(cas.select("webanno.custom.EntityLinking")):
        candidates = list(
            cas.select_covered("inception.internal.KbHandle", entity))

        if not candidates:
            continue

        # Locate the gold candidate; entities without one are skipped.
        for i, candidate in enumerate(candidates):
            if entity.iri == candidate.iri:
                gold_idx = i
                break
        else:
            continue

        sentences = list(
            cas.select_covering(
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                entity))
        assert len(sentences) == 1
        sentence = sentences[0]

        mention = entity.get_covered_text().lower()
        context = sentence.get_covered_text().lower()

        for cid, candidate in enumerate(candidates):
            score = float(entity.iri == candidate.iri)
            query = candidate.query
            # Normalise a missing label to "" *before* lower-casing; calling
            # ``.lower()`` first would fail on ``None``, which the ``or ""``
            # guard in the original call suggests can occur.
            label = (candidate.label or "").lower()

            result = fg.featurize_candidate(qid, cid, "inception_rank", score,
                                            mention, context, label,
                                            candidate.description or "",
                                            entity.iri, gold_idx,
                                            candidate.iri, features)

            result.update(fg.featurize_query(mention, query, label))

            results.append(result)

    return results
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        """Predict spans from BIO tags produced by the CRF model.

        Every sentence is featurized with ``self._sent2features`` and all
        sentences are tagged in one ``model.predict`` call. Consecutive
        ``B``/``I`` tags are merged into one span, and each span is added as
        a prediction with the fixed label ``"X"``. If ``self._load_model``
        returns ``None`` a debug message is logged and nothing is predicted.

        Args:
            cas: The CAS to read sentences/tokens from and add predictions to.
            layer: Passed through to ``create_prediction``.
            feature: Passed through to ``create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Identifies which model to load.

        Raises:
            AssertionError: If the model returns a different number of
                sentences or tags than were passed in.
        """
        model: Optional[sklearn_crfsuite.CRF] = self._load_model(user_id)

        if model is None:
            logger.debug("No trained model ready yet!")
            return

        all_tokens = []
        featurized_sentences = []

        for sentence in cas.select(SENTENCE_TYPE):
            tokens = list(cas.select_covered(TOKEN_TYPE, sentence))
            words = [token.get_covered_text() for token in tokens]

            all_tokens.append(tokens)
            featurized_sentences.append(self._sent2features(words))

        all_predictions = model.predict(featurized_sentences)

        assert len(all_predictions) == len(all_tokens)
        for predictions, tokens in zip(all_predictions, all_tokens):
            assert len(predictions) == len(tokens)

            # Offsets of the span currently being built; ``begin`` is only
            # set by a "B" tag, so an "I" without a preceding "B" never
            # produces a span.
            begin = None
            end = None
            for tag, token in zip(predictions, tokens):
                # An open span is closed by "O" or by the start of the next
                # span. Closing on any "B" (not only "B" after "I") keeps
                # single-token spans that are directly followed by another.
                if begin is not None and end is not None:
                    if tag == "O" or tag.startswith("B"):
                        prediction = create_prediction(cas, layer, feature,
                                                       begin, end, "X")
                        cas.add_annotation(prediction)

                if tag.startswith("B"):
                    begin = token.begin
                    end = token.end
                elif tag.startswith("I"):
                    end = token.end
                else:
                    begin = None
                    end = None

            # Flush a span that runs up to the last token of the sentence;
            # no later tag exists to close it inside the loop.
            if begin is not None and end is not None:
                prediction = create_prediction(cas, layer, feature, begin,
                                               end, "X")
                cas.add_annotation(prediction)
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        """Run spaCy NER over the CAS tokens and add one prediction per entity.

        The existing ``TOKEN_TYPE`` annotations define the tokenisation. Each
        entity found by the ``ner`` pipe is mapped back to character offsets
        through its first and last token.

        Args:
            cas: The CAS to read tokens from and add predictions to.
            layer: Passed through to ``create_prediction``.
            feature: Passed through to ``create_prediction``.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Not used by this method.
        """
        # Extract the tokens from the CAS and create a spacy doc from it.
        # Materialised as a list because the tokens are indexed by position
        # below, which a plain iterator would not support.
        cas_tokens = list(cas.select(TOKEN_TYPE))
        words = [cas.get_covered_text(cas_token) for cas_token in cas_tokens]

        doc = Doc(self._model.vocab, words=words)

        # Find the named entities
        self._model.get_pipe("ner")(doc)

        # For every entity returned by spacy, create an annotation in the CAS
        for named_entity in doc.ents:
            # ``start`` is inclusive and ``end`` exclusive (token indices).
            begin = cas_tokens[named_entity.start].begin
            end = cas_tokens[named_entity.end - 1].end
            label = named_entity.label_
            prediction = create_prediction(cas, layer, feature, begin, end,
                                           label)
            cas.add_annotation(prediction)
Example #15
0
    def get_perc_of_mapping_type(self, cas: Cas,
                                 alignment: AlignmentLabel) -> float:
        """Return the share of matched tokens whose match has the given label.

        A token counts as matched when the item returned by
        ``self.get_mappable_ann`` has a ``match`` with a non-``None``
        ``target``.

        Args:
            cas: The CAS whose ``FeatureExtractor.TOKEN_TYPE`` annotations
                are inspected.
            alignment: The label whose ``name`` is compared to ``match.label``.

        Returns:
            Matched tokens with that label divided by all matched tokens, or
            ``0.0`` when no token is matched.
        """
        matched_total = 0
        matched_with_label = 0

        for token in cas.select(FeatureExtractor.TOKEN_TYPE):
            item = self.get_mappable_ann(cas, token)

            # Only tokens that are actually aligned take part in the ratio.
            if item.match is None or item.match.target is None:
                continue

            matched_total += 1
            if item.match.label == alignment.name:
                matched_with_label += 1

        # Nothing matched at all: define the ratio as 0 instead of dividing.
        if matched_total == 0:
            return 0.0
        return matched_with_label / matched_total
Example #16
0
    def predict(self, cas: Cas, layer: str, feature: str, project_id: str,
                document_id: str, user_id: str):
        """Align the words of the first two sentences and add relations.

        The ``webanno.custom.Base`` annotations covered by the first sentence
        are the source side, those covered by the second sentence the target
        side. ``self._aligner.get_word_aligns`` returns alignments per
        matching method; only the first method's alignments are turned into
        relation annotations of type ``layer``. Intermediate values are
        printed to stdout.

        Args:
            cas: The CAS to read from and add predictions to.
            layer: Name of the relation type to instantiate.
            feature: Name of the feature set to ``""`` on each prediction.
            project_id: Not used by this method.
            document_id: Not used by this method.
            user_id: Not used by this method.

        Raises:
            IndexError: If the CAS contains fewer than two sentences.
        """
        # Materialise every selection: sentences and tokens are accessed by
        # index, and the token selections are also traversed once for their
        # text before being indexed.
        sentences = list(cas.select(SENTENCE_TYPE))

        src_tokens = list(
            cas.select_covered("webanno.custom.Base", sentences[0]))
        trg_tokens = list(
            cas.select_covered("webanno.custom.Base", sentences[1]))

        src_sentence = [e.get_covered_text() for e in src_tokens]
        trg_sentence = [e.get_covered_text() for e in trg_tokens]

        print(src_sentence)
        print(trg_sentence)

        alignments = self._aligner.get_word_aligns(src_sentence, trg_sentence)

        Relation = cas.typesystem.get_type(layer)
        print(list(Relation.all_features))

        for matching_method in alignments:
            for source_idx, target_idx in alignments[matching_method]:
                src = src_tokens[source_idx]
                target = trg_tokens[target_idx]
                # The relation is anchored on the target token's span.
                prediction = Relation(
                    Governor=src,
                    Dependent=target,
                    begin=target.begin,
                    end=target.end,
                    inception_internal_predicted=True,
                )
                # The feature value itself is left empty.
                setattr(prediction, feature, "")
                print(source_idx, target_idx, prediction)

                cas.add_annotation(prediction)
            # Only the first matching method is used.
            break