Example n. 1
    def __init__(self, vocab, lang="und", logfile=None):
        """ Create a sppasTok instance.

        :param vocab: (str) name of the file with the orthographic vocabulary
        :param lang: (str) the language code
        :param logfile: (sppasLog)

        """
        sppasBaseAnnotation.__init__(self, logfile)

        # Text normalizer working on the given vocabulary.
        # (removed the dead pre-assignment of self.normalizer to None:
        # it was overwritten two lines below)
        voc = sppasVocabulary(vocab)
        self.normalizer = TextNormalizer(voc, lang)

        # Replacement dictionary: optional, language-dependent resource.
        # Fall back to an empty dictionary when the resource is missing.
        replace_filename = os.path.join(RESOURCES_PATH, "repl", lang + ".repl")
        if os.path.exists(replace_filename):
            dict_replace = sppasDictRepl(replace_filename, nodump=True)
        else:
            dict_replace = sppasDictRepl()
        self.normalizer.set_repl(dict_replace)

        # Punctuations dictionary: optional, shared across languages.
        punct_filename = os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt")
        if os.path.exists(punct_filename):
            vocab_punct = sppasVocabulary(punct_filename, nodump=True)
        else:
            vocab_punct = sppasVocabulary()
        self.normalizer.set_punct(vocab_punct)

        # List of options to configure this automatic annotation
        self._options['faked'] = True
        self._options['std'] = False
        self._options['custom'] = False
Example n. 2
    def test_sampa(self):
        """Check normalization of X-SAMPA forms embedded in the orthography."""

        repl_file = os.path.join(RESOURCES_PATH, "repl", "fra.repl")
        self.tok.set_repl(sppasDictRepl(repl_file, nodump=True))

        # [ortho,/sampa/] alternation: sampa form by default, ortho with "std".
        entry = u("[le mot,/lemot/]")
        self.assertEqual(u("/lemot/"), self.tok.normalize(entry, []))
        self.assertEqual(u("le_mot"), self.tok.normalize(entry, ["std"]))
        self.assertEqual(u("/lemot/"), self.tok.normalize(entry))

        # minus is accepted in sampa transcription (it is the phonemes separator)
        self.assertEqual(u("/l-e-f-o~-n/"),
                         self.tok.normalize(u(" /l-e-f-o~-n/ ")))
        self.assertEqual(u("/le~/"), self.tok.normalize(u(" /le~/ ")))

        # whitespace is not accepted in sampa transcription
        self.assertEqual(u("le mot"), self.tok.normalize(u(" /le mot/ ")))

        # a slash inside a plain token is left untouched by clean_toe
        transcription = sppasTranscription()
        cleaned = transcription.clean_toe(u("ah a/b euh"))
        self.assertEqual(cleaned, u("ah a/b euh"))
Example n. 3
    def test_replace(self):
        """Check token replacements driven by the language .repl resources."""

        repl = sppasDictRepl(os.path.join(RESOURCES_PATH, "repl", "fra.repl"), nodump=True)
        self.tok.set_repl(repl)
        s = self.tok.replace([u("un"), u("taux"), u("de"), u("croissance"), u("de"), u("0,5"), u("%")])
        # assertEquals is a deprecated alias (removed in Python 3.12): use assertEqual.
        self.assertEqual(s, [u("un"), u("taux"), u("de"), u("croissance"), u("de"), u("0"), u("virgule"), u("5"),
                             u("pourcents")])

        text = [u("² % °c  km/h  etc   €  ¥ $ ")]

        # Same input, one expected replacement result per language resource.
        expectations = [
            ("eng", u("square percent degrees_Celsius km/h etc euros yens dollars")),
            ("spa", u("quadrados por_ciento grados_Celsius km/h etc euros yens dollars")),
            ("fra", u("carrés pourcents degrés_celcius kilomètres_heure etcetera euros yens dollars")),
            ("ita", u("quadrato percento gradi_Celsius km/h etc euros yens dollars")),
            ("cmn", u("的平方 个百分比 摄氏度 公里每小时 etc € ¥ $")),
        ]
        for lang, expected in expectations:
            repl = sppasDictRepl(os.path.join(RESOURCES_PATH, "repl", lang + ".repl"), nodump=True)
            self.tok.set_repl(repl)
            s = self.tok.replace(text)
            self.assertEqual(" ".join(s), expected)
Example n. 4
    def __init__(self, lang, dict_replace=None, speech=True):
        """Creates a sppasSimpleSplitter.

        :param lang: the language code in iso639-3.
        :param dict_replace: Replacement dictionary
        :param speech: (bool) split transcribed speech vs written text

        """
        self.__lang = lang
        self.__speech = speech
        # Fall back to an empty replacement dictionary when none is given.
        self.__repl = dict_replace if dict_replace is not None else sppasDictRepl(None)
Example n. 5
    def __init__(self, lang, dict_replace=None, speech=True):
        """ Creates a sppasTokSplitter.

        :param lang: the language code in iso639-3.
        :param dict_replace: Replacement dictionary
        :param speech: (bool) split transcribed speech vs written text

        """
        self.__lang = lang
        self.__speech = speech
        # Promote a missing dictionary to an empty sppasDictRepl.
        if dict_replace is None:
            dict_replace = sppasDictRepl(None)
        self.__repl = dict_replace
Example n. 6
    def test_num2letter(self):
        """ Test the integration of num2letter into the TextNormalizer. """

        repl = sppasDictRepl(os.path.join(RESOURCES_PATH, "repl", "fra.repl"), nodump=True)
        self.tok.set_repl(repl)
        self.tok.set_lang("fra")

        # assertEquals is a deprecated alias (removed in Python 3.12): use assertEqual.
        s = self.tok.normalize(u("123"))
        self.assertEqual(s, u("cent-vingt-trois"))

        s = self.tok.normalize(u("1,24"))
        self.assertEqual(s, u("un virgule vingt-quatre"))

        # normalization of numbers must fail for an unsupported language
        self.tok.set_lang("cat")
        with self.assertRaises(ValueError):
            self.tok.normalize(u("123"))
Example n. 7
    def __init__(self, vocab=None, lang="und"):
        """ Create a TextNormalizer instance.

        :param vocab: (sppasVocabulary)
        :param lang: the language code in iso639-3.

        """
        # resources: replacement dictionaries, punctuations and vocabulary
        self.dicoutf = DictReplUTF8()
        self.repl = sppasDictRepl(None)
        self.punct = sppasVocabulary()
        # an empty vocabulary stands in when none is given
        self.vocab = sppasVocabulary() if vocab is None else vocab

        # members
        self.lang = lang
        self.delimiter = ' '
Example n. 8
    def __init__(self, vocab=None, lang="und"):
        """ Create a TextNormalizer instance.

        :param vocab: (sppasVocabulary)
        :param lang: the language code in iso639-3.

        """
        # resources: replacement dictionaries, punctuations and vocabulary
        self.dicoutf = DictReplUTF8()
        self.repl = sppasDictRepl(None)
        self.punct = sppasVocabulary()
        # promote a missing vocabulary to an empty one
        if vocab is None:
            vocab = sppasVocabulary()
        self.vocab = vocab

        # members
        self.lang = lang
        self.delimiter = ' '
Example n. 9
    def test_num2letter(self):
        """... Integration of num2letter into the TextNormalizer."""

        repl = sppasDictRepl(os.path.join(paths.resources, "repl", "fra.repl"),
                             nodump=True)
        self.tok.set_repl(repl)
        self.tok.set_lang("fra")

        # assertEquals is a deprecated alias (removed in Python 3.12):
        # use assertEqual.
        self.assertEqual([u("cent-vingt-trois")],
                         self.tok.normalize(u("123")))

        self.assertEqual(
            u("un virgule vingt-quatre").split(),
            self.tok.normalize(u("1,24")))

        # normalization of numbers must fail for an unsupported language
        self.tok.set_lang("deu")
        with self.assertRaises(ValueError):
            self.tok.normalize(u("123"))
Example n. 10
    def test_sampa(self):
        """... X-SAMPA included into the ortho transcription."""

        repl_file = os.path.join(paths.resources, "repl", "fra.repl")
        self.tok.set_repl(sppasDictRepl(repl_file, nodump=True))

        # [ortho,/sampa/] alternation: the sampa form is kept by default,
        # the orthographic one when the "std" option is enabled.
        source = u("[le mot,/lemot/]")
        self.assertEqual([u("/lemot/")], self.tok.normalize(source, []))
        self.assertEqual([u("le_mot")], self.tok.normalize(source, ["std"]))
        self.assertEqual([u("/lemot/")], self.tok.normalize(source))

        # minus is accepted in sampa transcription (it is the phonemes separator)
        self.assertEqual([u("/l-e-f-o~-n/")],
                         self.tok.normalize(u(" /l-e-f-o~-n/ ")))
        self.assertEqual([u("/le~/")], self.tok.normalize(u(" /le~/ ")))

        # whitespace is not accepted in sampa transcription
        self.assertEqual(u("le mot").split(),
                         self.tok.normalize(u(" /le mot/ ")))
Example n. 11
    def test_replace(self):
        """... Examine tokens and performs some replacements."""

        repl = sppasDictRepl(os.path.join(paths.resources, "repl", "fra.repl"),
                             nodump=True)
        self.tok.set_repl(repl)
        s = self.tok.replace([
            u("un"),
            u("taux"),
            u("de"),
            u("croissance"),
            u("de"),
            u("0,5"),
            u("%")
        ])
        # assertEquals is a deprecated alias (removed in Python 3.12):
        # use assertEqual.
        self.assertEqual(s, [
            u("un"),
            u("taux"),
            u("de"),
            u("croissance"),
            u("de"),
            u("0"),
            u("virgule"),
            u("5"),
            u("pourcents")
        ])

        text = [u("² % °c  km/h  etc   €  ¥ $ ")]

        # Same input, one expected replacement result per language resource.
        expectations = [
            ("eng",
             u("square percent degrees_Celsius km/h etc euros yens dollars")),
            ("spa",
             u("quadrados por_ciento grados_Celsius km/h etc euros yens dollars")),
            ("fra",
             u("carrés pourcents degrés_celcius kilomètres_heure etcetera euros yens dollars")),
            ("ita",
             u("quadrato percento gradi_Celsius km/h etc euros yens dollars")),
            ("cmn",
             u("的平方 个百分比 摄氏度 公里每小时 etc € ¥ $")),
        ]
        for lang, expected in expectations:
            repl = sppasDictRepl(
                os.path.join(paths.resources, "repl", lang + ".repl"),
                nodump=True)
            self.tok.set_repl(repl)
            s = self.tok.replace(text)
            self.assertEqual(expected, " ".join(s))
Example n. 12
    # Annotation mode: run the full sppasTextNorm annotation on the
    # input/output files given on the command line.
    p = sppasTextNorm(args.vocab, lang)
    if args.nofaked:
        p.set_faked(False)
    if args.std:
        p.set_std(True)
    if args.custom:
        p.set_custom(True)
    p.run(args.i, args.o)

# NOTE(review): the "if" matching this "else" is outside this excerpt.
else:

    # Interactive mode: build a TextNormalizer by hand and normalize stdin.
    vocab = sppasVocabulary(args.vocab)
    normalizer = TextNormalizer(vocab, lang)

    # Optional language-dependent replacement dictionary.
    replace_file = os.path.join(RESOURCES_PATH, "repl", lang + ".repl")
    if os.path.exists(replace_file):
        repl = sppasDictRepl(replace_file, nodump=True)
        normalizer.set_repl(repl)

    # Optional list of punctuation marks to be stripped.
    punct_file = os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt")
    if os.path.exists(punct_file):
        punct = sppasVocabulary(punct_file, nodump=True)
        normalizer.set_punct(punct)

    # Will output the faked orthography, one token per line.
    for line in sys.stdin:
        tokens = normalizer.normalize(line)
        for token in tokens:
            print("{!s:s}".format(token))  # .encode('utf8'))