Example #1
import io
import unittest

from somajo import SoMaJo


class TestSoMaJo(unittest.TestCase):
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC")

    def _equal_text(self, paragraphs, tokenized_sentences, parallel=1):
        sentences = self.tokenizer.tokenize_text(paragraphs, parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_text_file_single_newlines(self,
                                         paragraphs,
                                         tokenized_sentences,
                                         parallel=1):
        pseudofile = io.StringIO("\n".join(paragraphs))
        sentences = self.tokenizer.tokenize_text_file(
            pseudofile,
            paragraph_separator="single_newlines",
            parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_text_file_empty_lines(self,
                                     paragraphs,
                                     tokenized_sentences,
                                     parallel=1):
        pseudofile = io.StringIO("\n\n".join(paragraphs))
        sentences = self.tokenizer.tokenize_text_file(
            pseudofile, paragraph_separator="empty_lines", parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml(self, xml, tokenized_sentences, parallel=1):
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        sentences = self.tokenizer.tokenize_xml(xml,
                                                eos_tags,
                                                parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml_file(self, xml, tokenized_sentences, parallel=1):
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        pseudofile = io.StringIO(xml)
        sentences = self.tokenizer.tokenize_xml_file(pseudofile,
                                                     eos_tags,
                                                     parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
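
A concrete test built on these helpers passes the raw paragraphs together with the expected output, where each expected sentence is a whitespace-separated token string. The method below is an illustrative sketch, not one of the original tests:

    def test_example_sentence(self):
        self._equal_text(
            ["Das ist ein Test. Das auch."],
            ["Das ist ein Test .", "Das auch ."])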
Example #2
import json
from typing import List

from somajo import SoMaJo


# The excerpt starts mid-function; this header is a hypothetical
# reconstruction, since the original def is not part of the snippet.
def collect_answers(candidate_docs) -> str:
    answers = []
    for doc in candidate_docs:
        answers.append({
            "id": doc.id,
            "metaName": doc.meta["name"],
            "text": doc.text,
            "score": doc.query_score
        })
    return json.dumps(answers)


# Configuration: dataset location plus reader and retriever model names
data_path = "./kbQA/data/MLQA_V1"
reader_model_name_full = "mrm8488/bert-multi-cased-finetuned-xquadv1"
reader_model_name = reader_model_name_full.split("/")[1]
retriever_model_name_full = "distiluse-base-multilingual-cased"
retriever_model_type = "sentence_transformers"
somajo = SoMaJo("de_CMC", split_camel_case=False, split_sentences=True)


def sentence_segmentation(text: str) -> List[str]:
    # Split the text into sentences
    if not text.strip():
        return []
    tokenized_sentences = somajo.tokenize_text([text])
    sents = []
    # SoMaJo generates tokens, but we need sentences, so we concatenate
    # the tokens back into sentences and use SoMaJo as a sentence splitter.
    for token_sent in tokenized_sentences:
        sent = []
        for token in token_sent:
            word = token.text
            sent.append(word)
        # Join tokens with plain spaces; the exact spacing is irrelevant here,
        # since SoMaJo is only used to find sentence boundaries.
        sents.append(" ".join(sent))
    return sents
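
For illustration, calling the segmenter on a short German string (the input is invented here) yields one whitespace-joined string per detected sentence:

sentences = sentence_segmentation("Das ist ein Test. Das auch.")
# -> ['Das ist ein Test .', 'Das auch .']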
Example #3
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC")
Example #4
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC", split_sentences=False)
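
With split_sentences=False, SoMaJo skips sentence splitting, and tokenize_text then yields one token stream per input paragraph rather than one per sentence. A minimal sketch of the difference (the input string is illustrative):

from somajo import SoMaJo

tokenizer = SoMaJo("de_CMC", split_sentences=False)
for paragraph in tokenizer.tokenize_text(["Das ist ein Test. Das auch."]):
    print([token.text for token in paragraph])
# ['Das', 'ist', 'ein', 'Test', '.', 'Das', 'auch', '.']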