def __init__(self, vocab, lang="und", logfile=None):
    """ Create a sppasTok instance.

    :param vocab: (str) name of the file with the orthographic vocabulary
    :param lang: (str) the language code
    :param logfile: (sppasLog)

    """
    sppasBaseAnnotation.__init__(self, logfile)

    self.normalizer = None
    voc = sppasVocabulary(vocab)
    self.normalizer = TextNormalizer(voc, lang)

    # Replacement dictionary
    replace_filename = os.path.join(RESOURCES_PATH, "repl", lang + ".repl")
    if os.path.exists(replace_filename) is True:
        dict_replace = sppasDictRepl(replace_filename, nodump=True)
    else:
        dict_replace = sppasDictRepl()
    self.normalizer.set_repl(dict_replace)

    # Punctuations dictionary
    punct_filename = os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt")
    if os.path.exists(punct_filename) is True:
        vocab_punct = sppasVocabulary(punct_filename, nodump=True)
    else:
        vocab_punct = sppasVocabulary()
    self.normalizer.set_punct(vocab_punct)

    # List of options to configure this automatic annotation
    self._options['faked'] = True
    self._options['std'] = False
    self._options['custom'] = False
def setUp(self):
    dict_dir = os.path.join(paths.resources, "vocab")
    vocab_file = os.path.join(dict_dir, "fra.vocab")
    punct_file = os.path.join(dict_dir, "Punctuations.txt")
    wds = sppasVocabulary(vocab_file)
    puncts = sppasVocabulary(punct_file)
    self.tok = TextNormalizer(wds, "fra")
    self.tok.set_punct(puncts)
def setUp(self):
    dict_dir = os.path.join(RESOURCES_PATH, "vocab")
    vocab_file = os.path.join(dict_dir, "fra.vocab")
    punct_file = os.path.join(dict_dir, "Punctuations.txt")
    wds = sppasVocabulary(vocab_file)
    puncts = sppasVocabulary(punct_file)
    self.tok = TextNormalizer(wds, "fra")
    self.tok.set_punct(puncts)
def __init__(self, vocab=None, lang="und"):
    """ Create a TextNormalizer instance.

    :param vocab: (sppasVocabulary)
    :param lang: the language code in iso639-3.

    """
    # resources
    self.dicoutf = DictReplUTF8()
    self.repl = sppasDictRepl(None)
    self.punct = sppasVocabulary()
    self.vocab = vocab
    if vocab is None:
        self.vocab = sppasVocabulary()

    # members
    self.lang = lang
    self.delimiter = ' '
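# A minimal usage sketch for TextNormalizer (illustration only). It assumes
# the same imports and resources as the surrounding snippets (os,
# sppasVocabulary, RESOURCES_PATH) and the normalize() call used by the
# command-line fragment at the end of this section; the input sentence and
# the helper name are made up for the example.
def normalize_one_line_example():
    vocab_file = os.path.join(RESOURCES_PATH, "vocab", "fra.vocab")
    punct_file = os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt")
    normalizer = TextNormalizer(sppasVocabulary(vocab_file), "fra")
    normalizer.set_punct(sppasVocabulary(punct_file))
    # Returns the normalized ("faked orthography") tokens of one utterance
    return normalizer.normalize("bonjour, ça va ?")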
def set_vocab(self, filename):
    """ Fix the list of accepted tokens; any other token is marked as unknown.

    :param filename: (str) Name of the file with the list of tokens.

    """
    self.wrdlist = sppasVocabulary(filename, nodump=True, case_sensitive=False)
def test_code_switching(self):
    dictdir = os.path.join(RESOURCES_PATH, "vocab")
    vocabfra = os.path.join(dictdir, "fra.vocab")
    vocabcmn = os.path.join(dictdir, "cmn.vocab")
    wds = sppasVocabulary(vocabfra)
    wds.load_from_ascii(vocabcmn)
    self.assertEqual(len(wds), 457922)
def test_code_switching(self): """... [TO DO] support of language switching.""" dictdir = os.path.join(paths.resources, "vocab") vocabfra = os.path.join(dictdir, "fra.vocab") vocabcmn = os.path.join(dictdir, "cmn.vocab") wds = sppasVocabulary(vocabfra) wds.load_from_ascii(vocabcmn) self.assertEquals(len(wds), 456381)
def __init__(self, resource_file="", logfile=None):
    """ Create a new sppasRepetition instance.

    :param resource_file: Either the lemma dictionary or the list of
    stop-words. Attention: the extension of the resource file name is very
    important: it must be ".stp" for stop-words and ".lem" for lemmas
    (case-sensitive)!

    """
    sppasBaseAnnotation.__init__(self, logfile)

    # Members
    self._use_lemmatize = True   # Lemmatize the input
    self._use_stopwords = True   # Add specific stopwords of the input
    self._span = 5               # Detection length (nb of IPUs; 1=current IPU)
    self._alpha = 0.5            # Specific stop-words threshold coefficient
    self.lemmatizer = LemmaDict()
    self.stop_words = sppasVocabulary()

    # Create the lemmatizer instance
    try:
        lemma_file = resource_file.replace(".stp", ".lem")
        self.lemmatizer.load(lemma_file)
    except Exception:
        self._use_lemmatize = False

    if self._use_lemmatize is False:
        if logfile is not None:
            logfile.print_message("Lemmatization disabled.", indent=2, status=3)
        else:
            print(" ... ... [ INFO ] Lemmatization disabled.")
    else:
        if logfile is not None:
            logfile.print_message("Lemmatization enabled.", indent=2, status=3)
        else:
            print(" ... ... [ INFO ] Lemmatization enabled.")

    # Create the list of stop words (list of non-relevant words).
    # If loading fails, the stop list simply stays empty.
    try:
        stop_file = resource_file.replace(".lem", ".stp")
        self.stop_words.load_from_ascii(stop_file)
    except Exception:
        pass

    if self._use_stopwords is False:
        if logfile is not None:
            logfile.print_message("StopWords disabled.", indent=2, status=3)
        else:
            print(" ... ... [ INFO ] StopWords disabled.")
    else:
        if logfile is not None:
            logfile.print_message("StopWords: {:d}".format(len(self.stop_words)),
                                  indent=2, status=3)
        else:
            print(" ... ... [ INFO ] StopWords: {:d}".format(len(self.stop_words)))
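# Self-contained illustration (standard library only) of why a single
# resource_file is enough in the constructor above: whatever extension is
# given, the two replace() calls always yield one ".lem" candidate for the
# lemmatizer and one ".stp" candidate for the stop-words. The file names
# below are made up for the example.
for resource_file in ("fra.lem", "fra.stp"):
    lemma_file = resource_file.replace(".stp", ".lem")
    stop_file = resource_file.replace(".lem", ".stp")
    print(resource_file, "->", lemma_file, "/", stop_file)
# fra.lem -> fra.lem / fra.stp
# fra.stp -> fra.lem / fra.stp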
def __init__(self, stop_list=None):
    """ Create a SelfRules instance.

    :param stop_list: (sppasVocabulary or list) Non-relevant tokens.

    """
    self.__stoplist = sppasVocabulary()
    if stop_list is not None:
        if isinstance(stop_list, sppasVocabulary):
            self.__stoplist = stop_list
        else:
            for token in stop_list:
                self.__stoplist.add(token)
def testVocab(self):
    wds = sppasVocabulary()
    wds.add("a")
    wds.add("b")
    wds.add("c")
    ngramcounter = sppasNgramCounter(1, wds)
    ngramcounter.count(self.corpusfile)
    self.assertEqual(ngramcounter.get_count('a'), 15)
    self.assertEqual(ngramcounter.get_count('b'), 10)
    self.assertEqual(ngramcounter.get_count('c'), 4)
    self.assertEqual(ngramcounter.get_count('d'), 0)
    self.assertEqual(ngramcounter.get_count(symbols.unk), 3)
    self.assertEqual(ngramcounter.get_count(START_SENT_SYMBOL), 0)
    self.assertEqual(ngramcounter.get_count(END_SENT_SYMBOL), 3)
def testVocab(self):
    wds = sppasVocabulary()
    wds.add("a")
    wds.add("b")
    wds.add("c")
    ngramcounter = sppasNgramCounter(1, wds)
    ngramcounter.count(self.corpusfile)
    self.assertEqual(ngramcounter.get_count('a'), 15)
    self.assertEqual(ngramcounter.get_count('b'), 10)
    self.assertEqual(ngramcounter.get_count('c'), 4)
    self.assertEqual(ngramcounter.get_count('d'), 0)
    self.assertEqual(ngramcounter.get_count(unk_stamp), 3)
    self.assertEqual(ngramcounter.get_count(START_SENT_SYMBOL), 0)
    self.assertEqual(ngramcounter.get_count(END_SENT_SYMBOL), 3)
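# Self-contained illustration (standard library only) of the behaviour the two
# tests above rely on: a token outside the restricted vocabulary is counted
# under the "unknown" symbol rather than under its own form. The corpus and
# the "<unk>" stamp below are made up for the example and are not the ones
# used by sppasNgramCounter.
from collections import Counter

restricted_vocab = {"a", "b", "c"}
corpus = ["a", "b", "x", "a", "c", "y"]      # 'x' and 'y' are out of vocabulary
counts = Counter(tok if tok in restricted_vocab else "<unk>" for tok in corpus)
print(counts["a"], counts["<unk>"])          # 2 2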
def get_stop_list(self, tier=None):
    """ Return the expected list of stop-words.

    It is either:
        - the loaded list or,
        - the loaded list + non-relevant tokens, estimated on the basis
          of the given tier.

    A token 'w' is relevant for the speaker if its probability is less
    than a threshold:

        | P(w) <= 1 / (alpha * V)

    where 'alpha' is an empirical coefficient and 'V' is the vocabulary
    size of the speaker.

    :param tier: (Tier) A tier with entries to be analyzed.

    """
    if self._use_stopwords is False:
        return sppasVocabulary()

    if tier is None:
        return self.stop_words

    # Create the sppasUnigram and put data
    u = sppasUnigram()
    for a in tier:
        if a.GetLabel().IsSpeech() is True:
            u.add(a.GetLabel().GetValue())

    # Estimate values for relevance
    _v = float(len(u))
    threshold = 1. / (self._alpha * _v)

    # Estimate if a token is relevant; if not, put it in the stop-list
    stop_list = self.stop_words.copy()
    for token in u.get_tokens():
        p_w = float(u.get_count(token)) / float(u.get_sum())
        if p_w > threshold:
            stop_list.add(token)
            if self.logfile is not None:
                self.logfile.print_message(
                    'Add in the stop-list: {:s}'.format(token), indent=3)

    return stop_list
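# Self-contained worked example (standard library only) of the relevance
# threshold documented above, P(w) <= 1 / (alpha * V). The token counts and
# the alpha value are made up for the example. With alpha = 0.5 and V = 4
# distinct tokens, threshold = 1 / (0.5 * 4) = 0.5, so only "euh"
# (P = 6/10 = 0.6 > 0.5) would be added to the stop-list.
alpha = 0.5
counts = {"euh": 6, "alors": 2, "bonjour": 1, "merci": 1}
total = sum(counts.values())
threshold = 1. / (alpha * len(counts))
extra_stops = [w for w, c in counts.items() if float(c) / float(total) > threshold]
print(threshold, extra_stops)    # 0.5 ['euh']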
lang = base[:3]

if args.i:
    p = sppasTextNorm(args.vocab, lang)
    if args.nofaked:
        p.set_faked(False)
    if args.std:
        p.set_std(True)
    if args.custom:
        p.set_custom(True)
    p.run(args.i, args.o)

else:
    vocab = sppasVocabulary(args.vocab)
    normalizer = TextNormalizer(vocab, lang)

    replace_file = os.path.join(RESOURCES_PATH, "repl", lang + ".repl")
    if os.path.exists(replace_file):
        repl = sppasDictRepl(replace_file, nodump=True)
        normalizer.set_repl(repl)

    punct_file = os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt")
    if os.path.exists(punct_file):
        punct = sppasVocabulary(punct_file, nodump=True)
        normalizer.set_punct(punct)

    # Will output the faked orthography
    for line in sys.stdin:
        tokens = normalizer.normalize(line)
def __init__(self, resource_file="", logfile=None):
    """ Create a new sppasRepetition instance.

    :param resource_file: Either the lemma dictionary or the list of
    stop-words. Attention: the extension of the resource file name is very
    important: it must be ".stp" for stop-words and ".lem" for lemmas
    (case-sensitive)!

    """
    sppasBaseAnnotation.__init__(self, logfile, "Repetitions")

    # Members
    self._use_lemmatize = True   # Lemmatize the input
    self._use_stopwords = True   # Add specific stopwords of the input
    self._span = 5               # Detection length (nb of IPUs; 1=current IPU)
    self._alpha = 0.5            # Specific stop-words threshold coefficient
    self.lemmatizer = LemmaDict()
    self.stop_words = sppasVocabulary()

    # Create the lemmatizer instance
    try:
        lemma_file = resource_file.replace(".stp", ".lem")
        self.lemmatizer.load(lemma_file)
    except Exception:
        self._use_lemmatize = False

    if self._use_lemmatize is False:
        if logfile is not None:
            logfile.print_message("Lemmatization disabled.", indent=2, status=3)
        else:
            print(" ... ... [ INFO ] Lemmatization disabled.")
    else:
        if logfile is not None:
            logfile.print_message("Lemmatization enabled.", indent=2, status=3)
        else:
            print(" ... ... [ INFO ] Lemmatization enabled.")

    # Create the list of stop words (list of non-relevant words).
    # If loading fails, the stop list simply stays empty.
    try:
        stop_file = resource_file.replace(".lem", ".stp")
        self.stop_words.load_from_ascii(stop_file)
    except Exception:
        pass

    if self._use_stopwords is False:
        if logfile is not None:
            logfile.print_message("StopWords disabled.", indent=2, status=3)
        else:
            print(" ... ... [ INFO ] StopWords disabled.")
    else:
        if logfile is not None:
            logfile.print_message("StopWords: {:d}".format(
                len(self.stop_words)), indent=2, status=3)
        else:
            print(" ... ... [ INFO ] StopWords: {:d}".format(
                len(self.stop_words)))