def setUp(self):
    """Create a phonetizer backed by a tiny dictionary: a, b, c, short pause."""
    self.dd = sppasDictPron()
    self.grph = sppasDictPhonetizer(self.dd)
    # Each letter is phonetized by itself.
    for entry in ("a", "b", "c"):
        self.dd.add_pron(entry, entry)
    self.dd.add_pron(SP_ORTHO, SP)
def gen_slm_dependencies(self, basename, N=3):
    """Generate the dependencies (slm, dictionary) for julius.

    :param basename: (str) base name of the slm and dictionary files
    :param N: (int) Language model N-gram length.

    """
    dict_name = basename + ".dict"
    slm_name = basename + ".arpa"

    # Build a pronunciation dictionary from the parallel token/phone lists.
    dictpron = sppasDictPron()
    for token, pron in zip(self._tokens.split(), self._phones.split()):
        for one_variant in pron.split("|"):
            dictpron.add_pron(token, one_variant.replace("-", " "))

    # Sentence boundary symbols must be phonetized too (as silence).
    for symbol in (START_SENT_SYMBOL, END_SENT_SYMBOL):
        if dictpron.is_unk(symbol) is True:
            dictpron.add_pron(symbol, SIL_PHON)
    dictpron.save_as_ascii(dict_name, False)

    # Estimate the statistical language model and save it in ARPA format.
    model = sppasNgramsModel(N)
    model.append_sentences([self._tokens])
    arpa_io = sppasArpaIO()
    arpa_io.set(model.probabilities(method="logml"))
    arpa_io.save(slm_name)
def gen_slm_dependencies(self, basename, N=3):
    """ Generate the dependencies (slm, dictionary) for julius.

    :param basename: (str) the base name of the slm file and of the
    dictionary file
    :param N: (int) Language model N-gram length.

    """
    dictname = basename + ".dict"
    slmname = basename + ".arpa"

    # Fill in a pronunciation dictionary: one entry per variant.
    dictpron = sppasDictPron()
    for token, pron in zip(self._tokens.split(), self._phones.split()):
        for one_variant in pron.split("|"):
            dictpron.add_pron(token, one_variant.replace("-", " "))

    # Sentence start/end symbols are phonetized as silence.
    for symbol in (START_SENT_SYMBOL, END_SENT_SYMBOL):
        if dictpron.is_unk(symbol) is True:
            dictpron.add_pron(symbol, "sil")
    dictpron.save_as_ascii(dictname, False)

    # Write the SLM
    model = sppasNgramsModel(N)
    model.append_sentences([self._tokens])
    arpaio = sppasArpaIO()
    arpaio.set(model.probabilities(method="logml"))
    arpaio.save(slmname)
def setUp(self):
    """Create a phonetizer with a minimal dictionary: a, b, c and '+'."""
    self.dd = sppasDictPron()
    self.grph = sppasDictPhonetizer(self.dd)
    # Letters map to themselves; '+' is phonetized as a short pause.
    for entry in ("a", "b", "c"):
        self.dd.add_pron(entry, entry)
    self.dd.add_pron("+", "sp")
def set_dict(self, dict_filename):
    """ Set the pronunciation dictionary.

    :param dict_filename: (str) The pronunciation dictionary in
    HTK-ASCII format with UTF-8 encoding.

    """
    # Re-create the phonetizer on top of the freshly loaded dictionary.
    pron_dict = sppasDictPron(dict_filename, nodump=False)
    self.phonetizer = sppasDictPhonetizer(pron_dict, self.maptable)
def add_from_dict(self, dict_filename):
    """Add the list of phones from a pronunciation dictionary.

    :param dict_filename: (str) Name of an HTK-ASCII pronunciation dict

    """
    pron_dict = sppasDictPron(dict_filename)
    # Walk every variant of every entry and collect its phonemes.
    for entry in pron_dict:
        pronunciations = pron_dict.get_pron(entry)
        for one_variant in pronunciations.split(separators.variants):
            for phone in one_variant.split(separators.phonemes):
                self.add(phone)
def add_from_dict(self, dict_filename):
    """ Add the list of phones from a pronunciation dictionary.

    :param dict_filename: (str) Name of an HTK-ASCII pronunciation
    dictionary

    """
    pron_dict = sppasDictPron(dict_filename)
    # Collect every phoneme of every variant of every entry.
    for entry in pron_dict:
        pronunciations = pron_dict.get_pron(entry)
        for one_variant in pronunciations.split(VARIANTS_SEPARATOR):
            for phone in one_variant.split(PHONEMES_SEPARATOR):
                self.add(phone)
def add_from_dict(self, dict_filename):
    """ Add the list of phones from a pronunciation dictionary.

    :param dict_filename: (str) Name of an HTK-ASCII pronunciation
    dictionary

    """
    pron_dict = sppasDictPron(dict_filename)
    # Every phoneme of every variant of every entry is added.
    for entry in pron_dict:
        pronunciations = pron_dict.get_pron(entry)
        for one_variant in pronunciations.split(sppasDictPron.VARIANTS_SEPARATOR):
            for phone in one_variant.split(sppasDictPron.PHONEMES_SEPARATOR):
                self.add(phone)
def test_phonetizeFR(self):
    """Phonetization of French tokens with the real resource dictionary.

    Fix: the multi-variant result was compared as an exact string, which
    is brittle because the order of the '|'-separated variants is an
    implementation detail. Compare as sets of variants instead, which is
    consistent with the other set-based comparisons in this test suite.
    """
    dictdir = os.path.join(SPPAS, "resources", "dict")
    dictfile = os.path.join(dictdir, "fra.dict")
    dd = sppasDictPron(dictfile)
    grph = sppasDictPhonetizer(dd)

    # Unknown compound word, unknown-word phonetizer disabled: UNK.
    result = grph.phonetize('pas_encore', phonunk=False)
    self.assertEqual(result, 'UNK')

    # Unknown-word phonetizer enabled: variants are generated.
    result = grph.phonetize('pas_encore', phonunk=True)
    expected = "p.a.a~.k.o.r|p.a.z.a~.k.o.r|p.a.a~.k.o.r.eu|p.a.z.a~.k.o.r.eu"
    self.assertEqual(set(result.split("|")), set(expected.split("|")))

    # A token enclosed in slashes is returned as-is, whatever the option.
    result = grph.phonetize(u'/lemot/', phonunk=True)
    self.assertEqual(result, u"lemot")
    result = grph.phonetize(u'/lemot/', phonunk=False)
    self.assertEqual(result, u"lemot")
def test_phonetizeFR(self):
    """Phonetization of French tokens with the real resource dictionary.

    Fix: the multi-variant result was compared as an exact string, which
    is brittle because the order of the '|'-separated variants is an
    implementation detail. Compare as sets of variants instead, which is
    consistent with the other set-based comparisons in this test suite.
    """
    dictdir = os.path.join(SPPAS, "resources", "dict")
    dictfile = os.path.join(dictdir, "fra.dict")
    dd = sppasDictPron(dictfile)
    grph = sppasDictPhonetizer(dd)

    # Unknown compound word, unknown-word phonetizer disabled: UNK.
    result = grph.phonetize('pas_encore', phonunk=False)
    self.assertEqual(result, 'UNK')

    # Unknown-word phonetizer enabled: variants are generated.
    result = grph.phonetize('pas_encore', phonunk=True)
    expected = "p.a.a~.k.o.r|p.a.z.a~.k.o.r|p.a.a~.k.o.r.eu|p.a.z.a~.k.o.r.eu"
    self.assertEqual(set(result.split("|")), set(expected.split("|")))

    # A token enclosed in slashes is returned as-is, whatever the option.
    result = grph.phonetize(u'/lemot/', phonunk=True)
    self.assertEqual(result, u"lemot")
    result = grph.phonetize(u'/lemot/', phonunk=False)
    self.assertEqual(result, u"lemot")
def test_data(self):
    """Direct and mapped phonetization of entries of the English dict."""
    dictfile = os.path.join(RESOURCES_PATH, "dict", "eng.dict")
    map_table = os.path.join(RESOURCES_PATH, "dict", "eng-fra.map")
    mapt = sppasMapping(map_table)
    dd = sppasDictPron(dictfile)
    grph = sppasDictPhonetizer(dd)

    # No mapping table: the dictionary pronunciations are returned as-is.
    self.assertEqual(grph.get_phon_entry("THE"), "D-@|D-V|D-i:")
    self.assertEqual(grph.get_phon_entry("UR"), "3:r|U-r\\")
    self.assertEqual(grph.get_phon_entry("ARE"), "A-r\|3:r")
    self.assertEqual(grph.get_phon_entry("BANC"), "b-{-N-k")

    # With the eng-fra mapping table: mapped variants are added.
    grph.set_maptable(mapt)
    the = "z-@|D-@|v-@|v-V|D-V|z-V|z-9|D-9|v-9|z-i:|z-i|D-i|v-i|D-i:|v-i:"
    ur = "3:r|9-R|u-r\|U-w|u-w|U-R|U-r\|u-R"
    are = "a-R|A-R|a-w|A-w|a-r\|A-r\|3:r|9-R"
    # Variant order is not significant: compare as sets.
    for entry, expected in (("THE", the), ("UR", ur), ("ARE", are)):
        self.assertEqual(set(grph.get_phon_entry(entry).split("|")),
                         set(expected.split("|")))
def test_phon_from_loaded_data(self):
    """... Phonetization using real resource data."""
    dict_file = os.path.join(paths.resources, "dict", "eng.dict")
    map_table = os.path.join(paths.resources, "dict", "eng-fra.map")
    mapt = sppasMapping(map_table)
    dd = sppasDictPron(dict_file)
    grph = sppasDictPhonetizer(dd)

    # No mapping table: plain dictionary pronunciations (order-agnostic).
    for entry, expected in (("THE", "D-@|D-V|D-i:"),
                            ("UR", "3:r|U-r\\"),
                            ("ARE", "A-r\\|3:r"),
                            ("BANC", "b-{-N-k")):
        self.assertEqual(set(expected.split('|')),
                         set(grph.get_phon_entry(entry).split('|')))

    grph.set_maptable(mapt)
    grph.set_unk_variants(0)

    # DICT: the [] D @ / the(2) [] D V / the(3) [] D i:
    # MAP: D z / i: i / V 9 / V @
    self.assertEqual(
        set("D-@|D-V|D-i:|z-@|z-V|z-i:|D-i|z-i|D-9|z-9|z-@".split("|")),
        set(grph.get_phon_entry("THE").split("|")))

    # DICT: ur [] 3:r / ur(2) [] U r\
    # MAP: 3:r 9-R / U u / r\ R / r\ w
    self.assertEqual(set("3:r|U-r\\|9-R|u-r\\|U-R|U-w|u-R|u-w".split("|")),
                     set(grph.get_phon_entry("UR").split("|")))

    # DICT = are [] A r\ / are(2) [] 3:r
    # MAP: r\ R / r\ w / 3:r 9-R / A a
    self.assertEqual(set("A-r\\|3:r|a-r\\|9-R|A-R|A-w|a-R|a-w".split("|")),
                     set(grph.get_phon_entry("ARE").split("|")))
def gen_dependencies(self, grammarname, dictname):
    """ Generate the dependencies (grammar, dictionary) for HVite.

    :param grammarname: (str) the file name of the tokens
    :param dictname: (str) the dictionary file name

    """
    dictpron = sppasDictPron()

    with codecs.open(grammarname, 'w', encoding) as flab:
        for token, pron in zip(self._tokens.split(), self._phones.split()):
            # Dictionary: one entry per pronunciation variant.
            for one_variant in pron.split("|"):
                dictpron.add_pron(token, one_variant.replace("-", " "))
                if self._infersp is True:
                    # Also add a variant terminated by a short silence.
                    sil_variant = one_variant + '-sil'
                    dictpron.add_pron(token, sil_variant.replace("-", " "))
            # lab file (one token per line)
            flab.write(token + "\n")

    dictpron.save_as_ascii(dictname)
def gen_dependencies(self, grammar_name, dict_name):
    """Generate the dependencies (grammar, dictionary) for HVite.

    :param grammar_name: (str) the file name of the tokens
    :param dict_name: (str) the dictionary file name

    """
    dictpron = sppasDictPron()

    with codecs.open(grammar_name, 'w', sg.__encoding__) as flab:
        for token, pron in zip(self._tokens.split(), self._phones.split()):
            # Dictionary: one entry per pronunciation variant.
            # NOTE: the former 'infersp' silence-appended variants are
            # deliberately not generated here.
            for one_variant in pron.split("|"):
                dictpron.add_pron(token, one_variant.replace("-", " "))
            # lab file (one token per line)
            flab.write(token + "\n")

    dictpron.save_as_ascii(dict_name)
help='Input dictionary file name (as many as wanted)') parser.add_argument("--quiet", action='store_true', help="Disable the verbosity") if len(sys.argv) <= 1: sys.argv.append('-h') args = parser.parse_args() # ---------------------------------------------------------------------------- args = parser.parse_args() pron_dict = sppasDictPron(args.i, nodump=True) for entry in pron_dict: prons = pron_dict.get_pron(entry) nb_chars = float(len(entry)) for pron in prons.split(sppasDictPron.VARIANTS_SEPARATOR): phonetization = pron.split(sppasDictPron.PHONEMES_SEPARATOR) nb_phones = float(len(phonetization)) if nb_phones < nb_chars * 0.5: print("{:s}\t{:s}\tsmall".format(entry.encode('utf8'), pron.encode('utf8'))) elif nb_phones > nb_chars * 1.8:
if not args.quiet: log_level = cg.log_level else: log_level = cg.quiet_log_level lgs = sppasLogSetup(log_level) lgs.stream_handler() # ---------------------------------------------------------------------------- with_variant_nb = True with_filled_brackets = True if args.no_variant_numbers: with_variant_nb = False if args.no_filled_brackets: with_filled_brackets = False merge_dict = sppasDictPron() # ---------------------------------------------------------------------------- args = parser.parse_args() for dict_file in args.i: if not args.quiet: print("Read input dictionary file: ") pron_dict = sppasDictPron(dict_file, nodump=True) if not args.quiet: print(" [ OK ]") for entry in pron_dict: prons = pron_dict.get_pron(entry) for pron in prons.split(separators.variants):
if not args.quiet:
    setup_logging(0, None)
else:
    setup_logging(30, None)

# ----------------------------------------------------------------------------
# Automatic Phonetization is here:
# ----------------------------------------------------------------------------

unkopt = not args.nounk

mapfile = None
if args.map:
    mapfile = args.map

if args.i:
    # Input file(s) given: run the full annotation on them.
    p = sppasPhon(args.dict, mapfile)
    p.set_unk(unkopt)
    p.set_usestdtokens(False)
    p.run(args.i, args.o)
else:
    # No input file: phonetize each line read from stdin.
    pdict = sppasDictPron(args.dict, nodump=False)
    maptable = sppasMapping()
    if mapfile is not None:
        maptable = sppasMapping(mapfile)
    phonetizer = sppasDictPhonetizer(pdict, maptable)
    for line in sys.stdin:
        print("{:s}".format(phonetizer.phonetize(line, unkopt)))
help='Input dictionary file name (as many as wanted)') parser.add_argument("--quiet", action='store_true', help="Disable the verbosity") if len(sys.argv) <= 1: sys.argv.append('-h') args = parser.parse_args() # ---------------------------------------------------------------------------- args = parser.parse_args() pron_dict = sppasDictPron(args.i, nodump=True) for entry in pron_dict: prons = pron_dict.get_pron(entry) nb_chars = float(len(entry)) for pron in prons.split(VARIANTS_SEPARATOR): phonetization = pron.split(PHONEMES_SEPARATOR) nb_phones = float(len(phonetization)) if nb_phones < nb_chars * 0.5: print("{:s}\t{:s}\tsmall".format(entry.encode('utf8'), pron.encode('utf8')))
help="Disable the verbosity") if len(sys.argv) <= 1: sys.argv.append('-h') args = parser.parse_args() # ---------------------------------------------------------------------------- with_variant_nb = True with_filled_brackets = True if args.no_variant_numbers: with_variant_nb = False if args.no_filled_brackets: with_filled_brackets = False merge_dict = sppasDictPron() # ---------------------------------------------------------------------------- args = parser.parse_args() for dict_file in args.i: if not args.quiet: print("Read input dictionary file: ") pron_dict = sppasDictPron(dict_file, nodump=True) if not args.quiet: print(" [ OK ]") for entry in pron_dict: prons = pron_dict.get_pron(entry) for pron in prons.split(sppasDictPron.VARIANTS_SEPARATOR):
# ----------------------------------------------------------------------------

if not args.quiet:
    setup_logging(1, None)

# ----------------------------------------------------------------------------
# Automatic Phonetization is here:
# ----------------------------------------------------------------------------

unkopt = not args.nounk

mapfile = None
if args.map:
    mapfile = args.map

if args.i:
    # Input file(s) given: run the full annotation on them.
    p = sppasPhon(args.dict, mapfile)
    p.set_unk(unkopt)
    p.set_usestdtokens(False)
    p.run(args.i, args.o)
else:
    # No input file: phonetize each line read from stdin.
    pdict = sppasDictPron(args.dict, nodump=False)
    maptable = sppasMapping()
    if mapfile is not None:
        maptable = sppasMapping(mapfile)
    phonetizer = sppasDictPhonetizer(pdict, maptable)
    for line in sys.stdin:
        print("{:s}".format(phonetizer.phonetize(line, unkopt)))