Example #1
    def __init__(self, vocab, lang="und", logfile=None):
        """ Create a sppasTok instance.

        :param vocab: (str) name of the file with the list of known words (the vocabulary)
        :param lang: (str) the language code
        :param logfile: (sppasLog)

        """
        sppasBaseAnnotation.__init__(self, logfile)

        voc = sppasVocabulary(vocab)
        self.normalizer = TextNormalizer(voc, lang)

        # Replacement dictionary
        replace_filename = os.path.join(RESOURCES_PATH, "repl", lang + ".repl")
        if os.path.exists(replace_filename):
            dict_replace = sppasDictRepl(replace_filename, nodump=True)
        else:
            dict_replace = sppasDictRepl()
        self.normalizer.set_repl(dict_replace)

        # Punctuations dictionary
        punct_filename = os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt")
        if os.path.exists(punct_filename):
            vocab_punct = sppasVocabulary(punct_filename, nodump=True)
        else:
            vocab_punct = sppasVocabulary()
        self.normalizer.set_punct(vocab_punct)

        # List of options to configure this automatic annotation
        self._options['faked'] = True
        self._options['std'] = False
        self._options['custom'] = False
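A minimal construction sketch for the annotator above; the fra.vocab resource path follows the test examples below, so treat it as an assumption:

    # Hypothetical usage: build the annotator from the French vocabulary resource.
    vocab_file = os.path.join(RESOURCES_PATH, "vocab", "fra.vocab")  # assumed resource path
    annotator = sppasTok(vocab_file, lang="fra")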
Example #2
    def setUp(self):
        dict_dir = os.path.join(paths.resources, "vocab")
        vocab_file = os.path.join(dict_dir, "fra.vocab")
        punct_file = os.path.join(dict_dir, "Punctuations.txt")
        wds = sppasVocabulary(vocab_file)
        puncts = sppasVocabulary(punct_file)
        self.tok = TextNormalizer(wds, "fra")
        self.tok.set_punct(puncts)
Example #3
    def setUp(self):
        dict_dir = os.path.join(RESOURCES_PATH, "vocab")
        vocab_file = os.path.join(dict_dir, "fra.vocab")
        punct_file = os.path.join(dict_dir, "Punctuations.txt")
        wds = sppasVocabulary(vocab_file)
        puncts = sppasVocabulary(punct_file)
        self.tok = TextNormalizer(wds, "fra")
        self.tok.set_punct(puncts)
Example #4
    def __init__(self, vocab=None, lang="und"):
        """ Create a TextNormalizer instance.

        :param vocab: (sppasVocabulary)
        :param lang: (str) the language code in ISO 639-3.

        """
        # resources
        self.dicoutf = DictReplUTF8()
        self.repl = sppasDictRepl(None)
        self.punct = sppasVocabulary()
        self.vocab = vocab
        if vocab is None:
            self.vocab = sppasVocabulary()

        # members
        self.lang = lang
        self.delimiter = ' '
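A short usage sketch of the two construction paths this initializer supports:

    tn_default = TextNormalizer()                         # falls back to an empty vocabulary, language "und"
    tn_french = TextNormalizer(sppasVocabulary(), "fra")  # explicit vocabulary and ISO 639-3 code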
Example #6
    def set_vocab(self, filename):
        """ Fix a list of accepted tokens; others are mentioned as unknown.

        :param filename: (str) List of tokens.

        """
        self.wrdlist = sppasVocabulary(filename,
                                       nodump=True,
                                       case_sensitive=False)
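A hedged call sketch for this setter, assuming an instance named annotator and the fra.vocab resource used in the tests above:

    # Loads the vocabulary case-insensitively, without writing a dump cache (nodump=True)
    annotator.set_vocab(os.path.join(RESOURCES_PATH, "vocab", "fra.vocab"))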
Example #7
    def test_code_switching(self):

        dictdir  = os.path.join(RESOURCES_PATH, "vocab")
        vocabfra = os.path.join(dictdir, "fra.vocab")
        vocabcmn = os.path.join(dictdir, "cmn.vocab")

        wds = sppasVocabulary(vocabfra)
        wds.load_from_ascii(vocabcmn)
        self.assertEqual(len(wds), 457922)
Example #8
    def test_code_switching(self):
        """... [TO DO] support of language switching."""

        dictdir = os.path.join(paths.resources, "vocab")
        vocabfra = os.path.join(dictdir, "fra.vocab")
        vocabcmn = os.path.join(dictdir, "cmn.vocab")

        wds = sppasVocabulary(vocabfra)
        wds.load_from_ascii(vocabcmn)
        self.assertEqual(len(wds), 456381)
Example #9
    def __init__(self, resource_file="", logfile=None):
        """ Create a new sppasRepetition instance.

        :param resource_file: Either the lemma dictionary or the list of stop-words.

        Attention: the extension of the resource file name is very important:
        it must be ".stp" for a stop-word list and ".lem" for a lemma
        dictionary (case-sensitive)!

        """
        sppasBaseAnnotation.__init__(self, logfile)

        # Members
        self._use_lemmatize = True   # Lemmatize the input
        self._use_stopwords = True   # Add specific stopwords of the input
        self._span = 5               # Detection length (nb of IPUs; 1=current IPU)
        self._alpha = 0.5            # Specific stop-words threshold coefficient
        self.lemmatizer = LemmaDict()
        self.stop_words = sppasVocabulary()

        # Create the lemmatizer instance
        try:
            lemma_file = resource_file.replace(".stp", ".lem")
            self.lemmatizer.load(lemma_file)
        except Exception:
            self._use_lemmatize = False

        if self._use_lemmatize is False:
            if logfile is not None:
                logfile.print_message("Lemmatization disabled.", indent=2, status=3)
            else:
                print(" ... ... [ INFO ] Lemmatization disabled.")
        else:
            if logfile is not None:
                logfile.print_message("Lemmatization enabled.", indent=2, status=3)
            else:
                print(" ... ... [ INFO ] Lemmatization enabled.")

        # Create the list of stop words (list of non-relevant words)
        try:
            stop_file = resource_file.replace(".lem", ".stp")
            self.stop_words.load_from_ascii(stop_file)
        except Exception:
            # Without a stop-word list, disable the stop-word filtering
            self._use_stopwords = False

        if self._use_stopwords is False:
            if logfile is not None:
                logfile.print_message("StopWords disabled.", indent=2, status=3)
            else:
                print(" ... ... [ INFO ] StopWords disabled.")
        else:
            if logfile is not None:
                logfile.print_message("StopWords: {:d}".format(len(self.stop_words)), indent=2, status=3)
            else:
                print(" ... ... [ INFO ] StopWords: {:d}".format(len(self.stop_words)))
Example #10
    def __init__(self, stop_list=None):
        """Create a SelfRules instance.

        :param stop_list: (sppasVocabulary or list) Irrelevant tokens.

        """
        self.__stoplist = sppasVocabulary()
        if stop_list is not None:
            if isinstance(stop_list, sppasVocabulary):
                self.__stoplist = stop_list
            else:
                for token in stop_list:
                    self.__stoplist.add(token)
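A minimal sketch of the two stop-list types this constructor accepts (the filler tokens are illustrative):

    rules_from_vocab = SelfRules(sppasVocabulary())  # a vocabulary instance is used directly
    rules_from_list = SelfRules(["euh", "mh"])       # list items are added one by one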
Example #11
    def testVocab(self):
        wds = sppasVocabulary()
        wds.add("a")
        wds.add("b")
        wds.add("c")
        ngramcounter = sppasNgramCounter(1, wds)
        ngramcounter.count(self.corpusfile)

        self.assertEqual(ngramcounter.get_count('a'), 15)
        self.assertEqual(ngramcounter.get_count('b'), 10)
        self.assertEqual(ngramcounter.get_count('c'), 4)
        self.assertEqual(ngramcounter.get_count('d'), 0)
        self.assertEqual(ngramcounter.get_count(symbols.unk), 3)
        self.assertEqual(ngramcounter.get_count(START_SENT_SYMBOL), 0)
        self.assertEqual(ngramcounter.get_count(END_SENT_SYMBOL), 3)
Example #12
    def testVocab(self):
        wds = sppasVocabulary()
        wds.add("a")
        wds.add("b")
        wds.add("c")
        ngramcounter = sppasNgramCounter(1, wds)
        ngramcounter.count(self.corpusfile)

        self.assertEqual(ngramcounter.get_count('a'), 15)
        self.assertEqual(ngramcounter.get_count('b'), 10)
        self.assertEqual(ngramcounter.get_count('c'), 4)
        self.assertEqual(ngramcounter.get_count('d'), 0)
        self.assertEqual(ngramcounter.get_count(unk_stamp), 3)
        self.assertEqual(ngramcounter.get_count(START_SENT_SYMBOL), 0)
        self.assertEqual(ngramcounter.get_count(END_SENT_SYMBOL), 3)
Example #13
    def get_stop_list(self, tier=None):
        """ Return the expected list of stop-words.
        It is either:

            - the loaded list or,
            - the loaded list + un-relevant tokens, estimated on the basis
            of the given tier.

        A token 'w' is relevant for the speaker if its probability is
        less than a threshold:

            | P(w) <= 1 / (alpha * V)

        where 'alpha' is an empirical coefficient and 'V' is the vocabulary
        size of the speaker.

        :param tier: (Tier) A tier with entries to be analyzed.

        """
        if self._use_stopwords is False:
            return sppasVocabulary()

        if tier is None:
            return self.stop_words

        # Create the sppasUnigram and put data
        u = sppasUnigram()
        for a in tier:
            if a.GetLabel().IsSpeech():
                u.add(a.GetLabel().GetValue())

        # Estimate values for relevance
        _v = float(len(u))
        threshold = 1. / (self._alpha * _v)

        # Estimate if a token is relevant; if not: put in the stop-list
        stop_list = self.stop_words.copy()
        for token in u.get_tokens():
            p_w = float(u.get_count(token)) / float(u.get_sum())
            if p_w > threshold:
                stop_list.add(token)
                if self.logfile is not None:
                    self.logfile.print_message(
                        'Add in the stop-list: {:s}'.format(token), indent=3)

        return stop_list
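A worked check of the relevance threshold above, with alpha=0.5 as set in the constructor and an assumed speaker vocabulary size:

    alpha = 0.5
    v = 40.                        # assumed number of distinct tokens of the speaker
    threshold = 1. / (alpha * v)   # = 0.05
    # A token seen 3 times out of 50 has P(w) = 0.06 > 0.05, so it is
    # judged irrelevant for this speaker and is added to the stop-list.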
Example #15
lang = base[:3]

if args.i:

    p = sppasTextNorm(args.vocab, lang)
    if args.nofaked:
        p.set_faked(False)
    if args.std:
        p.set_std(True)
    if args.custom:
        p.set_custom(True)
    p.run(args.i, args.o)

else:

    vocab = sppasVocabulary(args.vocab)
    normalizer = TextNormalizer(vocab, lang)

    replace_file = os.path.join(RESOURCES_PATH, "repl", lang + ".repl")
    if os.path.exists(replace_file):
        repl = sppasDictRepl(replace_file, nodump=True)
        normalizer.set_repl(repl)

    punct_file = os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt")
    if os.path.exists(punct_file):
        punct = sppasVocabulary(punct_file, nodump=True)
        normalizer.set_punct(punct)

    # Output the faked orthography of each line read from stdin
    for line in sys.stdin:
        tokens = normalizer.normalize(line)
        print(tokens)
Example #16
    def __init__(self, resource_file="", logfile=None):
        """ Create a new sppasRepetition instance.

        :param resource_file: Either the lemma dictionary or the list of stop-words.

        Attention: the extension of the resource file name is very important:
        it must be ".stp" for a stop-word list and ".lem" for a lemma
        dictionary (case-sensitive)!

        """
        sppasBaseAnnotation.__init__(self, logfile, "Repetitions")

        # Members
        self._use_lemmatize = True  # Lemmatize the input
        self._use_stopwords = True  # Add specific stopwords of the input
        self._span = 5  # Detection length (nb of IPUs; 1=current IPU)
        self._alpha = 0.5  # Specific stop-words threshold coefficient
        self.lemmatizer = LemmaDict()
        self.stop_words = sppasVocabulary()

        # Create the lemmatizer instance
        try:
            lemma_file = resource_file.replace(".stp", ".lem")
            self.lemmatizer.load(lemma_file)
        except Exception:
            self._use_lemmatize = False

        if self._use_lemmatize is False:
            if logfile is not None:
                logfile.print_message("Lemmatization disabled.",
                                      indent=2,
                                      status=3)
            else:
                print(" ... ... [ INFO ] Lemmatization disabled.")
        else:
            if logfile is not None:
                logfile.print_message("Lemmatization enabled.",
                                      indent=2,
                                      status=3)
            else:
                print(" ... ... [ INFO ] Lemmatization enabled.")

        # Create the list of stop words (list of non-relevant words)
        try:
            stop_file = resource_file.replace(".lem", ".stp")
            self.stop_words.load_from_ascii(stop_file)
        except Exception:
            # Without a stop-word list, disable the stop-word filtering
            self._use_stopwords = False

        if self._use_stopwords is False:
            if logfile is not None:
                logfile.print_message("StopWords disabled.",
                                      indent=2,
                                      status=3)
            else:
                print(" ... ... [ INFO ] StopWords disabled.")
        else:
            if logfile is not None:
                logfile.print_message(
                    "StopWords: {:d}".format(len(self.stop_words)),
                    indent=2,
                    status=3)
            else:
                print(" ... ... [ INFO ] StopWords: {:d}".format(
                    len(self.stop_words)))