def variants(utt):
    """Glue the tokens of each brace-delimited variant group together.

    The tokens are joined with whitespace, the padding around braces and
    pipes is removed, then each whitespace located between an opening and
    a closing brace is turned into an underscore. The result is finally
    split on whitespace.

    :param utt: (list) Tokens of the utterance
    :returns: (list) Tokens in which a variant group is a single item

    """
    text = " ".join(utt)
    for padded, tight in (('{ ', '{'), (' }', '}'), (' | ', '|')):
        text = text.replace(padded, tight)

    in_braces = False
    glued = u("")
    for char in text:
        # The flag is updated before the character itself is copied.
        if char == "{":
            in_braces = True
        elif char == "}":
            in_braces = False
        glued += u("_") if in_braces and char == " " else char

    return glued.split()
def test_unicode(self):
    """Check that a non-ASCII key is reachable with or without u()."""
    meta = sppasMetaInfo()
    meta.add_metainfo('éè', 'moi')

    self.assertEqual(len(meta.keys_enabled()), 1)
    self.assertIn(u("éè"), meta.keys_enabled())

    # The same value is expected whatever the way the key is spelled.
    for key in ('éè', u('éè')):
        self.assertEqual(meta.get_metainfo(key), 'moi')
def test_parse_annotation_short(self):
    """Test the parsing of an annotation given as raw lines."""
    # An annotation whose tag is written on a single line
    content = ('0.0\n'
               '2.4971007546\n'
               '"gpf_0"\n')
    ann, nb = sppasTextGrid._parse_annotation(content.split("\n"), 0, True)
    self.assertEqual(nb, 3)
    expected_interval = sppasInterval(
        sppasTextGrid.make_point(0.),
        sppasTextGrid.make_point(2.4971007546))
    self.assertEqual(expected_interval, ann.get_location().get_best())
    self.assertEqual(sppasTag("gpf_0"), ann.get_labels()[0].get_best())

    # An annotation whose tag is written on two lines: one label is
    # expected for each line, and doubled quotes are expected to be
    # read as single ones.
    content = ('2.4971007546\n'
               '5.6838880379\n'
               '"hier soir j\'ai ouvert la \n'
               'porte d\'entrée pour laisser chort- sortir le ""chat"""\n')
    ann, nb = sppasTextGrid._parse_annotation(content.split("\n"), 0, True)
    self.assertEqual(nb, 4)
    expected_interval = sppasInterval(
        sppasTextGrid.make_point(2.4971007546),
        sppasTextGrid.make_point(5.6838880379))
    self.assertEqual(expected_interval, ann.get_location().get_best())
    self.assertEqual(
        u('hier soir j\'ai ouvert la'),
        ann.get_labels()[0].get_best().get_content())
    self.assertEqual(
        u('porte d\'entrée pour laisser chort- sortir le "chat"'),
        ann.get_labels()[1].get_best().get_content())
def format_location(location_root, location):
    """Fill a 'Location' element of the tree from a sppasLocation().

    One sub-element ('Point', 'Interval' or 'Disjoint') is appended for
    each localization; its 'score' attribute is set only if the score is
    not None. A localization of any other kind is ignored.

    :param location_root: (ET) XML Element tree root.
    :param location: (sppasLocation)

    """
    for localization, score in location:
        if localization.is_point():
            tag, formatter = 'Point', sppasXRA._format_point
        elif localization.is_interval():
            tag, formatter = 'Interval', sppasXRA._format_interval
        # NOTE(review): this method name does not follow the is_xxx()
        # naming of the two tests above -- confirm it exists on the
        # localization classes.
        elif localization.IsTimeDisjoint():
            tag, formatter = 'Disjoint', sppasXRA._format_disjoint
        else:
            continue

        node = ET.SubElement(location_root, tag)
        formatter(node, localization)
        if score is not None:
            node.set('score', u(str(score)))
def replace(self, utt):
    """Examine tokens and perform some replacements.

    A dictionary with symbols (self.repl) contains the replacements to
    operate.

    :param utt: (list) the utterance
    :returns: A list of strings

    """
    # Specific case of float numbers: a dot or a comma between two digits
    # is replaced by a keyword surrounded by whitespace.
    # Raw strings are used so that the regex escapes are not (invalid)
    # Python string escapes.
    sent = ' '.join(utt)
    sent = re.sub(u(r'([0-9])\.([0-9])'), u(r'\1 NUMBER_SEP_POINT \2'), sent)
    sent = re.sub(u(r'([0-9])\,([0-9])'), u(r'\1 NUMBER_SEP \2'), sent)
    sent = sppasUnicode(sent).to_strip()

    # Other generic replacements
    _result = list()
    for token in sent.split():
        if self.repl.is_key(token):
            # A token returned by split() is never empty, so replacing
            # the whole token by its substitute is a plain assignment.
            token = self.repl.replace(token)
        _result.append(sppasUnicode(token).to_strip())

    return _result
def test_parse_text_long(self):
    """Test text parser."""
    # standard tag
    content = ('\t\txmin = 0.0\n'
               '\t\txmax = 2.4971007546\n'
               '\t\ttext = "gpf_0"\n')
    labels, nb = sppasTextGrid._parse_text(content.split("\n"), 2)
    self.assertEqual([sppasLabel(sppasTag("gpf_0"))], labels)
    self.assertEqual(nb, 3)

    # multi-lines tag: one label is expected for each line of the text
    content = ('\t\txmin = 0.0\n'
               '\t\txmax = 2.4971007546\n'
               '\t\ttext = "hier soir j\'ai ouvert la\n'
               'porte d\'entrée\n'
               'pour laisser chort- sortir le ""chat"""\n')
    labels, nb = sppasTextGrid._parse_text(content.split("\n"), 2)
    expected_contents = (
        u('hier soir j\'ai ouvert la'),
        u('porte d\'entrée'),
        u('pour laisser chort- sortir le "chat"'),
    )
    for index, expected in enumerate(expected_contents):
        self.assertEqual(expected, labels[index].get_best().get_content())
    self.assertEqual(nb, 5)

    # multi-lines tag without its closing quote
    content = ('\t\txmin = 0.0\n'
               '\t\txmax = 2.4971007546\n'
               '\t\ttext = "hier soir j\'ai ouvert la\n'
               'porte d\'entrée\n')
    unterminated = content.split("\n")
    with self.assertRaises(AioLineFormatError):
        sppasTextGrid._parse_text(unterminated, 2)
def __milliers_ita(self, number):
    """Spell out in Italian a number lower than one million.

    Numbers lower than 1000 are delegated to self.centaine(); numbers
    greater than or equal to one million are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000:
        return self.centaine(number)

    thousands = int(number / 1000)
    rest = number % 1000

    # Thousands
    if number == 1000:
        return u("mille")
    if number < 2000:
        return u("mille-") + self.centaine(rest)
    if number < 10000:
        if rest != 0:
            return self.unite(thousands).strip() + u("-mila-") + self.centaine(rest)
        return self.unite(thousands).strip() + u("-mila")

    # Tens of thousands
    if number == 10000:
        return u("diecimila")
    if number < 100000:
        if rest != 0:
            return self.dizaine(thousands).strip() + u("-mila-") + self.centaine(rest)
        return self.dizaine(thousands) + u("-mila")

    # Hundreds of thousands
    if number == 100000:
        return u("centomila")
    if number < 1000000:
        if rest != 0:
            return self.centaine(thousands).strip() + u("-mila-") + self.centaine(int(rest))
        # Same "-mila" suffix as the other round-thousand branches:
        # "mila-" left a dangling hyphen at the end of the result.
        return self.centaine(thousands).strip() + u("-mila")

    return str(number)
def test_load_xml(self):
    """Load a pronunciation dictionary from a RALF dic file (xml)."""
    pron_dict = sppasDictPron(DICT_TEST_XML)
    self.assertGreater(len(pron_dict), 7500)

    # Expected pronunciation of some entries; variants are '|'-separated.
    expected_prons = (
        ("übst", u("y:-p-s-t")),
        ("Gesten", u("g-e:-s-t-@-n|g-e:-s-t-n")),
    )
    for entry, pron in expected_prons:
        self.assertEqual(pron, pron_dict.get_pron(entry))
def __milliers_fra(self, number):
    """Spell out in French a number lower than one million.

    Numbers lower than 1000 are delegated to self.centaine(); numbers
    greater than or equal to one million are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000:
        return self.centaine(number)

    thousands = int(number / 1000)
    rest = number % 1000

    # Thousands
    if number == 1000:
        return u("mille ")
    if number < 2000:
        return u("mille-") + self.centaine(rest)
    if number < 10000:
        if rest == 0:
            # Same "-mille " ending as the other round-thousand branches:
            # "-mille-" left a dangling hyphen at the end of the result.
            return self.unite(thousands) + u("-mille ")
        return self.unite(thousands) + u("-mille-") + self.centaine(rest)

    # Tens of thousands
    if number == 10000:
        return u("dix-mille")
    if number < 100000:
        if rest == 0:
            return self.dizaine(thousands) + u("-mille ")
        return self.dizaine(thousands) + u("-mille-") + self.centaine(rest)

    # Hundreds of thousands
    if number == 100000:
        return u("cent-mille")
    if number < 1000000:
        if rest == 0:
            return self.centaine(thousands) + u("-mille ")
        return self.centaine(thousands) + u("-mille-") + self.centaine(int(rest))

    return str(number)
def __parse_option(items):
    """Parse an option.

    Convert an "Option" section of the parser into an "Option" instance.
    Only the "id", "type", "value" and "text" entries are read; if an
    entry occurs several times, the last one is kept.

    :param items: Iterable of (name, value) pairs
    :returns: (sppasOption)

    """
    # Values used for the entries that are not given
    fields = {"id": "", "type": "str", "value": "", "text": ""}
    for name, value in items:
        if name in fields:
            fields[name] = u(value)

    opt = sppasOption(fields["id"])
    opt.set_type(fields["type"])
    opt.set_value(fields["value"])
    opt.set_text(fields["text"])
    return opt
def test_combine_methods(self):
    """Combine two comparisons, called directly then through get()."""
    starts, ends = self.tc.startswith, self.tc.endswith
    self.assertTrue(
        starts(sppasTag("abc"), u("a")) and ends(sppasTag("abc"), u("c")))

    # The same comparisons, the functions being fetched from their name
    self.assertTrue(
        self.tc.get("startswith")(sppasTag("abc"), u("a")) and
        self.tc.get("endswith")(sppasTag("abc"), u("c")))
def __milliers_fra(self, number):
    """Spell out in French a number lower than one million.

    Numbers lower than 1000 are delegated to self.centaine(); numbers
    greater than or equal to one million are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000:
        return self.centaine(number)

    thousands = int(number / 1000)
    rest = number % 1000

    # Thousands
    if number == 1000:
        return u("mille ")
    if number < 2000:
        return u("mille-") + self.centaine(rest)
    if number < 10000:
        if rest == 0:
            # Same "-mille " ending as the other round-thousand branches:
            # "-mille-" left a dangling hyphen at the end of the result.
            return self.unite(thousands) + u("-mille ")
        return self.unite(thousands) + u("-mille-") + self.centaine(rest)

    # Tens of thousands
    if number == 10000:
        return u("dix-mille")
    if number < 100000:
        if rest == 0:
            return self.dizaine(thousands) + u("-mille ")
        return self.dizaine(thousands) + u("-mille-") + self.centaine(rest)

    # Hundreds of thousands
    if number == 100000:
        return u("cent-mille")
    if number < 1000000:
        if rest == 0:
            return self.centaine(thousands) + u("-mille ")
        return self.centaine(thousands) + u("-mille-") + self.centaine(int(rest))

    return str(number)
def test_not_tag(self):
    """Test tag is not matching str."""
    tier = self.trs.find('P-Phonemes')
    tier_filter = sppasTierFilters(tier)

    matching = tier_filter.tag(exact=u("l"))
    others = tier_filter.tag(not_exact=u('l'))

    # Both results are expected to split the tier without overlap.
    self.assertEqual(len(tier), len(matching) + len(others))
def __dizaine_cmn(self, number):
    """Spell out in Chinese characters a number lower than 100.

    A number greater than or equal to 100 is returned as digits instead
    of failing on an unbound local variable.

    :param number: (int)
    :returns: (str)

    """
    if number < 10:
        return self.unite(number)

    if number < 100:
        tens = int(number / 10)
        units = number % 10
        _str = self.unite(tens) + u("十")
        if units != 0:
            _str = _str + self.unite(units)
        return _str

    return str(number)
def _format_point(point_node, point):
    """Set the attributes of a 'Point' element from a sppasPoint().

    The 'midpoint' attribute is always set; the 'radius' attribute is
    set only if the radius of the point is not None.

    :param point_node: (ET) XML Element node.
    :param point: (sppasPoint)

    """
    midpoint = point.get_midpoint()
    point_node.set('midpoint', u(str(midpoint)))

    radius = point.get_radius()
    if radius is None:
        return
    point_node.set('radius', u(str(radius)))
def test_icontains(self):
    """Test tag contains text (case in-sensitive)."""
    icontains = self.tc.icontains

    self.assertTrue(icontains(sppasTag("abc"), u("B")))
    self.assertFalse(icontains(sppasTag("abc"), u("d")))

    # A TypeError is expected if the first argument is not a sppasTag...
    with self.assertRaises(TypeError):
        icontains("abc", u("B"))
    # ... or if the text is given with b().
    with self.assertRaises(TypeError):
        icontains(sppasTag("abc"), b("d"))
def test_toe(self):
    """Apply clean_toe then toe_spelling on phonetized entries."""
    transcription = sppasTranscription()

    # (entry, expected result after both steps)
    samples = (
        (u(" /l-e-f-o~-n/ "), u('/l-e-f-o~-n/')),
        (u(" /le mot/ "), u('/ le mot /')),
    )
    for entry, expected in samples:
        cleaned = transcription.clean_toe(entry)
        spelled = transcription.toe_spelling(cleaned)
        self.assertEqual(spelled, expected)
def test_iexact(self):
    """Test tag == text (case in-sensitive)."""
    iexact = self.tc.iexact

    self.assertTrue(iexact(sppasTag("abc"), u("ABC")))
    self.assertFalse(iexact(sppasTag("abc"), u("AAA")))

    # A TypeError is expected if the first argument is not a sppasTag...
    with self.assertRaises(TypeError):
        iexact("abc", u("ABC"))
    # ... or if the text is given with b().
    with self.assertRaises(TypeError):
        iexact(sppasTag("abc"), b("ABC"))
def __milliards_spa(self, number):
    """Spell out in Spanish a number, up to the thousands of millions.

    Numbers lower than 10^9 are delegated to self.millions().

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000000:
        return self.millions(number)

    # Floor division: "/" is a true division with Python 3 and would
    # hand a float over to self.millions().
    n = number // 1000000000
    r = number % 1000000000

    s = u("%s-mil-millones") % self.millions(n)
    if r == 0:
        return s
    return u("%s-%s") % (s, self.millions(r))
def __dizaine_cmn(self, number):
    """Spell out in Chinese characters a number lower than 100.

    A number greater than or equal to 100 is returned as digits instead
    of failing on an unbound local variable.

    :param number: (int)
    :returns: (str)

    """
    if number < 10:
        return self.unite(number)

    if number < 100:
        tens = int(number / 10)
        units = number % 10
        _str = self.unite(tens) + u("十")
        if units != 0:
            _str = _str + self.unite(units)
        return _str

    return str(number)
def test_startswith(self):
    """Test tag startswith text (case sensitive)."""
    startswith = self.tc.startswith

    self.assertTrue(startswith(sppasTag("abc"), u("a")))
    self.assertFalse(startswith(sppasTag("abc"), u("b")))

    # A TypeError is expected if the first argument is not a sppasTag...
    with self.assertRaises(TypeError):
        startswith("abc", u("a"))
    # ... or if the text is given with b().
    with self.assertRaises(TypeError):
        startswith(sppasTag("abc"), b("b"))
def test_iendswith(self):
    """Test tag endswith text (case in-sensitive)."""
    iendswith = self.tc.iendswith

    self.assertTrue(iendswith(sppasTag("abc"), u("C")))
    self.assertFalse(iendswith(sppasTag("abc"), u("b")))

    # A TypeError is expected if the first argument is not a sppasTag...
    with self.assertRaises(TypeError):
        iendswith("abc", u("C"))
    # ... or if the text is given with b().
    with self.assertRaises(TypeError):
        iendswith(sppasTag("abc"), b("b"))
def test_toe(self):
    """... Apply both clean_toe then toe_spelling."""
    # (entry, expected result after both steps)
    samples = (
        (u(" /l-e-f-o~-n/ "), u('/l-e-f-o~-n/')),
        (u(" /le mot/ "), u('/ le mot /')),
    )
    for entry, expected in samples:
        # A new instance is created for each of the two steps.
        cleaned = sppasOrthoTranscription().clean_toe(entry)
        spelled = sppasOrthoTranscription().toe_spelling(cleaned)
        self.assertEqual(expected, spelled)
def test_all(self):
    """Load a vocabulary, then test unknown entries and the add."""
    vocab = sppasVocabulary(VOCAB, nodump=True)
    self.assertEqual(len(vocab), 20)

    self.assertTrue(vocab.is_unk('toto'))
    for known in ('normale', "isn't", u("đ")):
        self.assertFalse(vocab.is_unk(known))

    # An entry added without u() is expected to be found with u().
    vocab.add("être")
    self.assertTrue(vocab.is_in(u("être")))
    self.assertIn(u("être"), vocab)
def is_enable_metainfo(self, key):
    """Return the status of a given key.

    The status is the first item stored for the key.

    :param key: (str) The key of the meta-information
    :returns: The status of the meta-information
    :raises: MetaKeyError if the key is unknown

    """
    ukey = u(key)
    if ukey in self._metainfo:
        return self._metainfo[ukey][0]
    raise MetaKeyError(key)
def get_metainfo(self, key):
    """Return the value of a given key.

    The value is the second item stored for the key.

    :param key: (str) The key of the meta-information
    :returns: The value of the meta-information
    :raises: MetaKeyError if the key is unknown

    """
    ukey = u(key)
    if ukey in self._metainfo:
        return self._metainfo[ukey][1]
    raise MetaKeyError(key)
def pop_metainfo(self, key):
    """Remove a meta information.

    The entry is deleted; nothing is returned.

    :param key: (str) The key of the meta-information
    :raises: MetaKeyError if the key is unknown

    """
    ukey = u(key)
    if ukey in self._metainfo:
        del self._metainfo[ukey]
    else:
        raise MetaKeyError(key)
def __centaine_ita(self, number):
    """Spell out in Italian a number made of hundreds.

    Numbers lower than 100 are delegated to self.dizaine(). From 200,
    the result starts with a whitespace.

    :param number: (int)
    :returns: (str)

    """
    if number < 100:
        return self.dizaine(number)
    if number == 100:
        return u("cento")

    hundreds = int(number / 100)
    rest = number % 100

    if number <= 199:
        return u("cento-") + self.dizaine(rest)

    if rest == 0:
        # The digit to spell is the number of hundreds: the remainder,
        # which was used here, is always 0 in this branch.
        return " " + self.unite(hundreds) + u("-cento")

    return " " + self.unite(hundreds) + u("-cento-") + self.dizaine(rest)
def clean_toe(entry):
    """Clean Enriched Orthographic Transcription.

    The convention includes information that must be removed.

    The UNICODE flag is given with the 'flags' keyword: the 4th
    positional argument of re.sub() is 'count', so the flag was
    formerly used as a maximum number of substitutions.

    :param entry: (str)
    :returns: (str)

    """
    # Proper names: $ name ,P\$
    entry = re.sub(u(r',\s?[PTS]+\s?[\/\\]+\s?\$'), r'', entry,
                   flags=re.UNICODE)
    entry = re.sub(u(r'\$'), r'', entry, flags=re.UNICODE)

    # Tags of the activity
    entry = re.sub(u(r'(gpd_[0-9]+)'), r" ", entry, flags=re.UNICODE)
    entry = re.sub(u(r'(gpf_[0-9]+)'), r" ", entry, flags=re.UNICODE)
    entry = re.sub(u(r'(ipu_[0-9]+)'), r" ", entry, flags=re.UNICODE)

    # Remove invalid parenthesis content
    entry = re.sub(u(r'\s+\([\w\xaa-\xff]+\)\s+'), ' ', entry,
                   flags=re.UNICODE)
    entry = re.sub(u(r'^\([\w\xaa-\xff]+\)\s+'), ' ', entry,
                   flags=re.UNICODE)
    entry = re.sub(u(r'\s+\([\w\xaa-\xff]+\)$'), ' ', entry,
                   flags=re.UNICODE)

    # Bracketed pairs "[a,b]" are handed over to the __replace callback
    entry = re.sub(u(r'\s*\[([^,]+),([^,]+)\]'),
                   sppasOrthoTranscription.__replace, entry,
                   flags=re.UNICODE)

    # Normalize the whitespace
    return " ".join(entry.split())
def enable_metainfo(self, key, value=True):
    """Enable/Disable a meta information.

    The status is the first item stored for the key; it is stored as a
    boolean.

    :param key: (str) The key of the meta-information
    :param value: (bool) Status of the meta-information
    :raises: MetaKeyError if the key is unknown

    """
    ukey = u(key)
    if ukey in self._metainfo:
        self._metainfo[ukey][0] = bool(value)
    else:
        raise MetaKeyError(key)
def __centaine_cmn(self, number):
    """Spell out in Chinese characters a number lower than 1000.

    Numbers lower than 100 are delegated to self.dizaine(); numbers
    greater than or equal to 1000 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 100:
        return self.dizaine(number)
    if not (number >= 100 and number < 1000):
        return str(number)

    hundreds = self.dizaine(int(number / 100))
    rest = number % 100
    if rest == 0:
        return hundreds + u("百")

    # The "零" character is inserted if the tens digit is zero
    link = u("百零") if rest < 10 else u("百")
    return hundreds + link + self.dizaine(rest)
def __millions_ita(self, number):
    """Spell out in Italian a number lower than 10^9.

    Numbers lower than one million are delegated to self.milliers();
    numbers greater than or equal to 10^9 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000:
        return self.milliers(number)
    if not number < 1000000000:
        return str(number)

    # NOTE(review): the remainder is spelled even if it is 0 -- check
    # what is then appended after the trailing hyphen.
    remainder = int(number % 1000000)
    if number < 2000000:
        return u("un-milione-") + self.milliers(remainder)

    count = self.centaine(int(number / 1000000)).strip()
    return count + u("-milioni-") + self.milliers(remainder)
def __milliers_eng(self, number):
    """Spell out in English a number made of thousands.

    Numbers lower than 1000 are delegated to self.centaine().

    :param number: (int)
    :returns: (str)

    """
    if number < 1000:
        return self.centaine(number)

    # Floor division: "/" is a true division with Python 3 and would
    # hand a float over to self.centaine().
    n = number // 1000
    r = number % 1000

    s = u("%s thousand") % self.centaine(n)
    if r == 0:
        return s
    return u("%s %s") % (s, self.centaine(r))
def __milliards_fra(self, number):
    """Spell out in French a number lower than 10^12.

    Numbers lower than 10^9 are delegated to self.millions(); numbers
    greater than or equal to 10^12 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000000:
        return self.millions(number)
    if not number < 1000000000000:
        return str(number)

    # NOTE(review): the remainder is spelled even if it is 0 -- check
    # what is then appended after the trailing hyphen.
    remainder = int(number % 1000000000)
    if number < 2000000000:
        return u("un-milliard-") + self.millions(remainder)

    count = self.centaine(int(number / 1000000000))
    return count + u("-milliards-") + self.millions(remainder)
def __millions_pol(self, number):
    """Spell out in Polish a number lower than 10^9.

    Numbers lower than one million are delegated to self.milliers();
    numbers greater than or equal to 10^9 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000:
        return self.milliers(number)
    if not number < 1000000000:
        return str(number)

    remainder = int(number % 1000000)
    if number < 2000000:
        return u("milion ") + self.milliers(remainder)

    count = self.centaine(int(number / 1000000)).strip()
    return count + u(" miliony ") + self.milliers(remainder)
def test_split_characters(self):
    """... Split a character-based string."""
    splitter = sppasSimpleSplitter("cmn")

    # (given string, expected result)
    samples = (
        ("干脆就把那部蒙人的闲法给废了拉倒",
         u("干 脆 就 把 那 部 蒙 人 的 闲 法 给 废 了 拉 倒")),
        ("abc123", u(" abc123 ")),
    )
    for given, expected in samples:
        self.assertEqual(expected, splitter.split_characters(given))
def __milliards_eng(self, number):
    """Spell out in English a number made of billions.

    Numbers lower than 10^9 are delegated to self.millions().

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000000:
        return self.millions(number)

    # Floor division: "/" is a true division with Python 3 and would
    # hand a float over to self.centaine().
    n = number // 1000000000
    r = number % 1000000000

    s = u("%s billion") % self.centaine(n)
    if r == 0:
        return s
    return u("%s %s") % (s, self.millions(r))
def __milliards_pol(self, number):
    """Spell out in Polish a number lower than 10^12.

    Numbers lower than 10^9 are delegated to self.millions(); numbers
    greater than or equal to 10^12 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000000:
        return self.millions(number)
    if not number < 1000000000000:
        return str(number)

    remainder = int(number % 1000000000)
    if number < 2000000000:
        # A whitespace separates the word from what follows, like for
        # the other Polish words: it was glued to the next word.
        return u("miliard ") + self.millions(remainder)

    count = self.centaine(int(number / 1000000000))
    return count + u(" miliardy ") + self.millions(remainder)
def __centaine_ita(self, number):
    """Spell out in Italian a number made of hundreds.

    Numbers lower than 100 are delegated to self.dizaine(). From 200,
    the result starts with a whitespace.

    :param number: (int)
    :returns: (str)

    """
    if number < 100:
        return self.dizaine(number)
    if number == 100:
        return u("cento")

    hundreds = int(number / 100)
    rest = number % 100

    if number <= 199:
        return u("cento-") + self.dizaine(rest)

    if rest == 0:
        # The digit to spell is the number of hundreds: the remainder,
        # which was used here, is always 0 in this branch.
        return " " + self.unite(hundreds) + u("-cento")

    return " " + self.unite(hundreds) + u("-cento-") + self.dizaine(rest)
def __centaine_cmn(self, number):
    """Spell out in Chinese characters a number lower than 1000.

    Numbers lower than 100 are delegated to self.dizaine(); numbers
    greater than or equal to 1000 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 100:
        return self.dizaine(number)
    if not (number >= 100 and number < 1000):
        return str(number)

    hundreds = self.dizaine(int(number / 100))
    rest = number % 100
    if rest == 0:
        return hundreds + u("百")

    # The "零" character is inserted if the tens digit is zero
    link = u("百零") if rest < 10 else u("百")
    return hundreds + link + self.dizaine(rest)
def __millier_cmn(self, number):
    """Spell out in Chinese characters a number lower than 10000.

    Numbers lower than 1000 are delegated to self.centaine(); numbers
    greater than or equal to 10000 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000:
        return self.centaine(number)
    if not (number >= 1000 and number < 10000):
        return str(number)

    thousands = self.centaine(int(number / 1000))
    rest = number % 1000
    if rest == 0:
        return thousands + u("千")

    # The "零" character is inserted if the hundreds digit is zero
    link = u("千零") if rest < 100 else u("千")
    return thousands + link + self.centaine(rest)
def __milliers_pol(self, number):
    """Spell out in Polish a number made of thousands.

    Numbers lower than 1000 are delegated to self.centaine().

    :param number: (int)
    :returns: (str)

    """
    if number < 1000:
        return self.centaine(number)

    # Floor division: "/" is a true division with Python 3 and would
    # hand a float over to self.centaine().
    n = number // 1000
    r = number % 1000

    # NOTE(review): the same word forms are used whatever the number of
    # thousands -- confirm the expected Polish declension.
    if number < 2000:
        s = u("tysięcy")
    else:
        s = u("%s tysiące") % self.centaine(n)

    if r == 0:
        return s
    return u("%s %s") % (s, self.centaine(r))
def __millions_cmn(self, number):
    """Spell out in Chinese characters a number with hundreds of millions.

    Numbers lower than 10^8 are delegated to __milliers_cmn().

    :param number: (int)
    :returns: (str)

    """
    if number < 100000000:
        return self.__milliers_cmn(number)

    head = self.__millier_cmn(int(number / 100000000))
    rest = number % 100000000
    if rest == 0:
        return head + u("亿")

    # The "零" character is inserted if the leading digit of the
    # remainder is zero
    link = u("亿零") if rest < 10000000 else u("亿")

    # The remainder is lower than 10^8: it is spelled by the function
    # already used above for such numbers. __millier_cmn(), formerly
    # used here, returns the digits for any number from 10000.
    return head + link + self.__milliers_cmn(rest)
def __milliers_cmn(self, number):
    """Spell out in Chinese characters a number lower than 10^8.

    Numbers lower than 10000 are delegated to __millier_cmn(); numbers
    greater than or equal to 10^8 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 10000:
        return self.__millier_cmn(number)
    # The range is checked first, so that a multiple of 10000 which is
    # out of range is returned as digits too.
    if not number < 100000000:
        return str(number)

    # The number of ten-thousands is up to 9999: it is spelled with
    # __millier_cmn(), not with self.unite() which this file only calls
    # with single digits.
    # NOTE(review): assumes self.centaine() routes to the functions of
    # the same language, so that a single digit is spelled as before.
    head = self.__millier_cmn(int(number / 10000))
    rest = number % 10000
    if rest == 0:
        return head + u("万")

    # The "零" character is inserted if the thousands digit is zero
    link = u("万零") if rest < 1000 else u("万")
    return head + link + self.__millier_cmn(rest)
def __millions_spa(self, number):
    """Spell out in Spanish a number made of millions.

    Numbers lower than one million are delegated to self.milliers().

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000:
        return self.milliers(number)

    # Floor division: "/" is a true division with Python 3 and would
    # hand a float over to self.milliers().
    n = number // 1000000
    r = number % 1000000

    if number < 2000000:
        s = u("un-millón")
    else:
        s = u("%s-millones") % self.milliers(n)

    if r == 0:
        return s
    return u("%s-%s") % (s, self.milliers(r))
def __milliards_ita(self, number):
    """Spell out in Italian a number lower than 10^12.

    Numbers lower than 10^9 are delegated to self.millions(); numbers
    greater than or equal to 10^12 are returned as digits.

    :param number: (int)
    :returns: (str)

    """
    if number < 1000000000:
        return self.millions(number)
    if not number < 1000000000000:
        return str(number)

    # NOTE(review): the remainder is spelled even if it is 0 -- check
    # what is then appended after the trailing hyphen.
    remainder = int(number % 1000000000)
    if number < 2000000000:
        return u("un-miliardo-") + self.millions(remainder)

    # No hyphen before the number of billions, like for the millions:
    # the result formerly started with a stray "-".
    count = self.centaine(int(number / 1000000000)).strip()
    return count + u("-miliardi-") + self.millions(remainder)
def __milliers_spa(self, number):
    """Spell out in Spanish a number made of thousands.

    Numbers lower than 1000 are delegated to self.centaine().

    :param number: (int)
    :returns: (str)

    """
    if number < 1000:
        return self.centaine(number)

    # Floor division: "/" is a true division with Python 3 and would
    # hand a float over to self.centaine().
    n = number // 1000
    r = number % 1000

    if number < 2000:
        s = u("mil")
    else:
        s = u("%s-mil") % self.centaine(n)

    if r == 0:
        return s
    return u("%s-%s") % (s, self.centaine(r))
def test_get_pron(self):
    """Get the pronunciation of entries of the test dictionary."""
    pron_dict = sppasDictPron(DICT_TEST, nodump=True)

    # (entry, expected pronunciation); variants are '|'-separated
    expected_prons = (
        ('sil', "s-i-l"),
        ('azerty', "<UNK>"),
        ('abc', "a-b-c|a-c"),
        ('toto', "t-o-t-o"),
        ('titi', "t-i-t-i"),
        ('tata', "t-a-t-a"),
        ('tutu', "t-u-t-u"),
        ('tyty', "t-y-t-y"),
        ('tete', "t-e-t-e"),
    )
    for entry, pron in expected_prons:
        self.assertEqual(pron_dict.get_pron(u(entry)), pron)
def test_add(self):
    """Add entries which differ only by their case or their type."""
    # simple and normal situation
    pron_dict = sppasDictPron()
    for entry in ("a", "A"):
        pron_dict.add_pron(entry, "a")
    self.assertEqual(len(pron_dict), 1)

    # unicode
    pron_dict = sppasDictPron()
    for entry in ("é", "É", u("É")):
        pron_dict.add_pron(entry, "e")
    self.assertEqual(len(pron_dict), 1)
    self.assertIn("É", pron_dict)
    self.assertIn(u("É"), pron_dict)
def test_num2letterES(self):
    """Convert numbers into their Spanish spelling.

    assertEqual() is used instead of its deprecated assertEquals()
    alias, which no longer exists in recent versions of unittest.

    """
    num = sppasNum('spa')

    # The numbers from 0 to 40 are compared to the reference list
    ret = [num.convert(i) for i in range(41)]
    self.assertEqual(ret, ref_es)

    # (number, expected spelling)
    samples = (
        (1241,
         u("mil-doscientos-cuarenta-y-uno")),
        (2346022,
         u("dos-millones-trescientos-cuarenta-y-seis-mil-veintidós")),
        (382121,
         u("trescientos-ochenta-y-dos-mil-ciento-veintiuno")),
        (739499,
         u("setecientos-treinta-y-nueve-mil-cuatrocientos-noventa-y-nueve")),
    )
    for number, expected in samples:
        self.assertEqual(num.convert(number), expected)
def convert(self, number):
    """Convert a number to a string.

    Example: 23 => twenty-three

    Each leading '0' of the number is converted with self.zero(), then
    the value is converted if some digits remain. The whitespace of the
    result is normalized.

    :param number: (int) A numerical representation
    :returns: string corresponding to the given number
    :raises: ValueError

    """
    if self._lang not in sppasNum.LANGUAGES:
        raise ValueError(
            "Unknown language {:s} for numerical conversion"
            "".format(self._lang))

    digits = str(number)
    if not digits.isdigit():
        raise ValueError(
            ("Numerical conversion is available only for positive "
             "unsigned integers. Got {:s}.").format(digits))

    spelled = ""
    remaining = digits
    value = int(digits)

    # Numbers starting by one or more '0' (like phone numbers...)
    while remaining.startswith(u("0")):
        spelled += self.zero()
        remaining = remaining[1:]

    if remaining:
        spelled += self.__convert(value)

    return ' '.join(spelled.split())