import io
import unittest

from somajo import SoMaJo


class TestSoMaJo(unittest.TestCase):
    def setUp(self):
        """Necessary preparations"""
        self.tokenizer = SoMaJo("de_CMC")

    def _equal_text(self, paragraphs, tokenized_sentences, parallel=1):
        sentences = self.tokenizer.tokenize_text(paragraphs, parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_text_file_single_newlines(self, paragraphs, tokenized_sentences, parallel=1):
        pseudofile = io.StringIO("\n".join(paragraphs))
        sentences = self.tokenizer.tokenize_text_file(
            pseudofile, paragraph_separator="single_newlines", parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_text_file_empty_lines(self, paragraphs, tokenized_sentences, parallel=1):
        pseudofile = io.StringIO("\n\n".join(paragraphs))
        sentences = self.tokenizer.tokenize_text_file(
            pseudofile, paragraph_separator="empty_lines", parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml(self, xml, tokenized_sentences, parallel=1):
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        sentences = self.tokenizer.tokenize_xml(xml, eos_tags, parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])

    def _equal_xml_file(self, xml, tokenized_sentences, parallel=1):
        eos_tags = "title h1 h2 h3 h4 h5 h6 p br hr div ol ul dl table".split()
        pseudofile = io.StringIO(xml)
        sentences = self.tokenizer.tokenize_xml_file(pseudofile, eos_tags, parallel=parallel)
        sentences = [[t.text for t in s] for s in sentences]
        self.assertEqual(sentences, [ts.split() for ts in tokenized_sentences])
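    # Illustrative sketch (not part of the original suite): a concrete test
    # method built on the helpers above. The input paragraph and the expected
    # tokenization are assumptions chosen for the example.
    def test_tokenize_text_simple(self):
        # One paragraph expected to come back as a single sentence whose
        # tokens are given whitespace-separated in the reference string.
        self._equal_text(["Das ist ein Satz."], ["Das ist ein Satz ."])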
import json
from typing import List

from somajo import SoMaJo


    # Tail of an enclosing function (definition not shown here): collect the
    # retrieved documents and return them as JSON.
    for doc in candidate_docs:
        answers.append({
            "id": doc.id,
            "metaName": doc.meta["name"],
            "text": doc.text,
            "score": doc.query_score
        })
    return json.dumps(answers)


data_path = "./kbQA/data/MLQA_V1"
reader_model_name_full = "mrm8488/bert-multi-cased-finetuned-xquadv1"
reader_model_name = reader_model_name_full.split("/")[1]
retriever_model_name_full = "distiluse-base-multilingual-cased"
retriever_model_type = "sentence_transformers"

somajo = SoMaJo("de_CMC", split_camel_case=False, split_sentences=True)


def sentence_segmentation(text: str) -> List[str]:
    # Split the text into sentences
    if not text.strip():
        return []
    tokenized_sentences = somajo.tokenize_text([text])
    sents = []
    # SoMaJo generates tokens. We need sentences instead, so we concatenate
    # the tokens back into sentences and use SoMaJo as a sentence splitter.
    for token_sent in tokenized_sentences:
        sent = []
        for token in token_sent:
            word = token.text
            sent.append(word)
        sents.append(" ".join(sent))
    return sents
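# Illustrative usage of sentence_segmentation; the demo text and the expected
# output are assumptions for demonstration. SoMaJo splits the paragraph into
# sentences and the tokens are re-joined with single spaces.
if __name__ == "__main__":
    demo = "Das ist der erste Satz. Hier kommt der zweite Satz."
    for sentence in sentence_segmentation(demo):
        print(sentence)
    # Expected (roughly):
    #   Das ist der erste Satz .
    #   Hier kommt der zweite Satz .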
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = SoMaJo("de_CMC")
def setUp(self):
    """Necessary preparations"""
    self.tokenizer = SoMaJo("de_CMC", split_sentences=False)
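# A minimal sketch (example input and expected tokens are assumptions) of the
# behaviour this setup exercises: with split_sentences=False, tokenize_text
# yields one token sequence per input paragraph rather than one per sentence.
from somajo import SoMaJo

tokenizer = SoMaJo("de_CMC", split_sentences=False)
paragraphs = tokenizer.tokenize_text(["Erster Satz. Zweiter Satz."])
tokens = [[t.text for t in p] for p in paragraphs]
# Expected (roughly): [["Erster", "Satz", ".", "Zweiter", "Satz", "."]]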