def test_code_switching(self): dictdir = os.path.join(RESOURCES_PATH, "vocab") vocabfra = os.path.join(dictdir, "fra.vocab") vocabcmn = os.path.join(dictdir, "cmn.vocab") wds = WordsList(vocabfra) wds.load_from_ascii( vocabcmn ) self.assertEquals( wds.get_size(), 458002)
def test_all(self):
    l = WordsList( VOCAB )
    self.assertEqual(l.get_size(), 20 )
    self.assertTrue( l.is_unk('toto') )
    self.assertFalse( l.is_unk('normale') )
    self.assertFalse( l.is_unk("isn't") )
    self.assertFalse( l.is_unk(u"đ") )
    l.add(u"être")
    self.assertTrue( l.is_in(u"être") )
    self.assertTrue( l.is_unk("être") )
def test_code_switching(self): dictdir = os.path.join(RESOURCES_PATH, "vocab") vocabfra = os.path.join(dictdir, "fra.vocab") vocabcmn = os.path.join(dictdir, "cmn.vocab") wds = WordsList(vocabfra) wds.load_from_ascii( vocabcmn ) self.assertEquals( wds.get_size(), 434333) self.tok.set_vocab( wds ) splitswitch = self.tok.tokenize(u'et il m\'a dit : "《干脆就把那部蒙人的闲法给废了拉倒!》RT @laoshipukong : 27日"') self.assertEqual(splitswitch, u"et il m' a dit 干脆 就 把 那 部 蒙 人 的 闲 法 给 废 了 拉倒 rt @ laoshipukong 二十七 日")
def __init__(self, filename=None):
    """
    Constructor.
    Add events to the list: laughter, dummy, noise, silence.

    @param filename (str) is the phoneset file name, i.e. a file with 1 column.

    """
    WordsList.__init__(self, filename, nodump=True, casesensitive=True)

    self.add("@@")
    self.add("dummy")
    self.add("gb")
    self.add("sil")
def test_save(self):
    l = WordsList( VOCAB )
    l.save( VOCAB2 )
    l2 = WordsList( VOCAB2 )
    self.assertEqual(l.get_size(), l2.get_size())
    for w in l.get_list():
        self.assertTrue(l2.is_in(w))
def test_save(self):
    l = WordsList( VOCAB, nodump=True )
    l.save( VOCAB_TEST )
    l2 = WordsList( VOCAB_TEST, nodump=True )
    self.assertEqual(l.get_size(), l2.get_size())
    for w in l.get_list():
        self.assertTrue(l2.is_in(w))
def testVocab(self):
    wds = WordsList()
    wds.add("a")
    wds.add("b")
    wds.add("c")
    ngramcounter = NgramCounter(1, wds)
    ngramcounter.count( self.corpusfile )
    self.assertEqual(ngramcounter.get_count('a'), 15)
    self.assertEqual(ngramcounter.get_count('b'), 10)
    self.assertEqual(ngramcounter.get_count('c'), 4)
    self.assertEqual(ngramcounter.get_count('d'), 0)
    self.assertEqual(ngramcounter.get_count(UNKSTAMP), 3)
    self.assertEqual(ngramcounter.get_count(START_SENT_SYMBOL), 0)
    self.assertEqual(ngramcounter.get_count(END_SENT_SYMBOL), 3)
def __init__(self, vocab=None, lang="und"): """ Create a new DictTok instance. @param vocab (WordsList) @param lang is the language code in iso639-3. """ # resources self.dicoutf = DictReplUTF8() self.repl = DictRepl(None) self.punct = WordsList(None) self.vocab = vocab self.speech = True # transcribed speech (and not written text) is to be tokenized if vocab is None: self.vocab = WordsList(None) # members self.lang = lang self.num2letter = sppasNum( lang ) self.delimiter = u' '
class DictTok:
    """
    @authors: Brigitte Bigi, Tatsuya Watanabe
    @contact: [email protected]
    @license: GPL, v3
    @summary: Tokenization automatic annotation.

    The creation of text corpora requires a sequence of processing steps in
    order to constitute, normalize, and then directly exploit them in a given
    application. This class implements a generic approach for text
    normalization and concentrates on the aspects of methodology and
    linguistic engineering, which serve to develop a multi-purpose
    multilingual text corpus. This approach consists in splitting the text
    normalization problem into a set of minor sub-problems that are as
    language-independent as possible.

    From the manual Enriched Orthographic Transcription, two derived
    orthographic transcriptions are generated automatically by the tokenizer:
    the "standard" transcription (the list of orthographic tokens) and the
    "faked spelling", a specific transcription from which the phonetic tokens
    used by the phonetization system are obtained.

    The following illustrates an utterance text normalization in French:

    - Transcription:
      j'ai on a j'ai p- (en)fin j'ai trouvé l(e) meilleur moyen c'était d(e) [loger,locher] chez des amis
      (English translation: I've we've I've - well I found the best way was to live in friends' apartment)
    - Resulting Standard tokens:
      j' ai on a j' ai p- enfin j' ai trouvé le meilleur moyen c'était de loger chez des amis
    - Resulting Faked tokens:
      j' ai on a j' ai p- fin j' ai trouvé l meilleur moyen c'était d loche chez des amis

    See the whole description of the algorithm in the following reference:
        Brigitte Bigi (2011). A Multilingual Text Normalization Approach.
        2nd Less-Resourced Languages workshop, 5th Language & Technology
        Conference, Poznan (Poland).

    """

    # ------------------------------------------------------------------

    def __init__(self, vocab=None, lang="und"):
        """
        Create a new DictTok instance.

        @param vocab (WordsList)
        @param lang is the language code in iso639-3.

        """
        # resources
        self.dicoutf = DictReplUTF8()
        self.repl    = DictRepl(None)
        self.punct   = WordsList(None)
        self.vocab   = vocab
        self.speech  = True  # transcribed speech (and not written text) is to be tokenized
        if vocab is None:
            self.vocab = WordsList(None)

        # members
        self.lang       = lang
        self.num2letter = sppasNum( lang )
        self.delimiter  = u' '

    # End __init__
    # ------------------------------------------------------------------


    # ------------------------------------------------------------------
    # Options
    # ------------------------------------------------------------------

    def set_delim(self, delim):
        """
        Set the delimiter, used to separate tokens.

        @param delim is a unicode character.

        """
        self.delimiter = delim

    # End set_delim
    # -------------------------------------------------------------------------

    def set_vocab(self, vocab):
        """
        Set the lexicon.

        @param vocab is a WordsList().

        """
        self.vocab = vocab

    # -------------------------------------------------------------------------

    def set_repl(self, repl):
        """
        Set the dictionary of replacements.

        @param repl (ReplDict)

        """
        self.repl = repl

    # -------------------------------------------------------------------------

    def set_punct(self, punct):
        """
        Set the list of punctuation.

        @param punct (WordsList)

        """
        self.punct = punct

    # -------------------------------------------------------------------------

    def set_lang(self, lang):
        """
        Set the language.

        @param lang is the language code in iso639-3 (fra, eng, vie, cmn...).

        """
        self.lang = lang

    # -------------------------------------------------------------------------


    # -------------------------------------------------------------------------
    # Language independent modules
    # -------------------------------------------------------------------------

    def split_characters(self, utt):
        """
        Split an utterance by characters.

        @param utt is the utterance (a transcription, a sentence, ...) in utf-8
        @return A string (split character by character, using spaces)

        """
        try:
            y = unicode(utt, 'utf-8')
        except Exception:
            y = utt
        tmp = " ".join( y )

        # split all characters except numbers and ascii characters
        sstr = re.sub(u"([0-90-9a-zA-ZA-T\s]+\.?[0-90-9a-zA-ZA-T\s]+)",
                      lambda o: u" %s " % o.group(0).replace(" ", ""), tmp)
        # and dates...
        if not self.speech:
            sstr = re.sub(u"([0-90-9\s]+\.?[月年日\s]+)",
                          lambda o: u" %s " % o.group(0).replace(" ", ""), sstr)
        # and ・
        sstr = re.sub(u'[\s]*・[\s]*', u"・", sstr)

        return sstr

    # ------------------------------------------------------------------

    def split(self, utt, std=False):
        """
        Split an utterance using spaces or split each character, depending
        on the language.

        @param utt (string): the utterance (a transcription, a sentence, ...)
        @param std (Boolean)
        @return A list (array of string)

        """
        s = utt
        if self.lang == "cmn" or self.lang == "jpn" or self.lang == "yue":
            s = self.split_characters( s )

        toks = s.split()
        s = ""
        for t in toks:
            if not "/" in t:  # if not a phonetized entry
                if std is False:
                    if self.lang != "cmn" and self.lang != "jpn" and self.lang != "yue":
                        # Split numbers if sticked to characters
                        # attention: do not replace [a-zA-Z] by [\w]
                        # (because \w includes numbers)
                        # and not on asian languages: it can be a tone!
                        s = re.sub(u'([0-9])([a-zA-Z])', ur'\1 \2', s)
                        s = re.sub(u'([a-zA-Z])([0-9])', ur'\1 \2', s)

                # Split some punctuation
                s = re.sub(u'\\[\\]', ur'\\] \\[', s)

                # Split dots if sticked to a word
                s = re.sub(u' \.([\w-])', ur'. \1', s)
                s = re.sub(u'^\.([\w-])', ur'. \1', s)

        s = " ".join(toks)

        # Then split each time there is a space and return result
        s = rutils.ToStrip( s )
        return s.split()

    # End split
    # ------------------------------------------------------------------

    def __stick_longest(self, utt, attachement="_"):
        """
        Longest matching algorithm.
        """
        tabtoks = utt.split(" ")
        i = len(tabtoks)
        while i > 0:
            # try to stick all tokens
            _token = attachement.join(tabtoks)
            if self.vocab.is_unk(_token) is False:
                return (i, _token)
            tabtoks.pop()
            i -= 1

        return (1, utt.split(" ")[0])

    # ------------------------------------------------------------------

    def stick(self, utt, attachement="_"):
        """
        Stick tokens of an utterance using '_'.
        Language independent.

        @param utt (list) the utterance (a transcription, a sentence, ...)
        @return A list of strings

        """
        _utt = []
        t1 = 0
        while t1 < len(utt):
            sl = utt[t1]
            # longest string ... in theory!
            lmax = t1 + 7
            if lmax > len(utt):
                lmax = len(utt)
            for t2 in range(t1+1, lmax):
                sl = sl + " " + utt[t2]
            # real longest string!
            (i, tok) = self.__stick_longest( rutils.ToStrip( sl ), attachement)
            t1 += i
            _utt.append( rutils.ToStrip( tok ) )

        return _utt

    # End stick
    # ------------------------------------------------------------------

    def replace(self, utt):
        """
        Examine tokens and perform some replacements.
        A dictionary with symbols contains the replacements to operate.

        This method also includes language specific replacements.
        Supported languages are: fra, cmn, jpn, yue, eng, ita, spa, khm, cat, pol.

        @param utt (list) the utterance
        @return A list of strings

        """
        # Specific case of float numbers
        sent = ' '.join(utt)
        sent = re.sub(u'([0-9])\.([0-9])', ur'\1 NUMBER_SEP_POINT \2', sent)
        sent = re.sub(u'([0-9])\,([0-9])', ur'\1 NUMBER_SEP \2', sent)
        sent = rutils.ToStrip( sent )
        _utt = sent.split()

        # Other generic replacements
        _result = []
        for s in _utt:
            if self.repl.is_key( s ):
                s = s.replace(s, self.repl.replace(s))
            _result.append(rutils.ToStrip( s ))

        return _result

    # End replace
    # -----------------------------------------------------------------------

    def compound(self, utt):
        """
        Examine tokens containing - or ' and split depending on rules.
        Language independent.

        @param utt (list) the utterance
        @return A list of strings

        """
        _utt = []
        for tok in utt:
            # a missing compound word?
            #   --> an unknown token
            #   --> containing a special character
            #   --> that is not a truncated word!
            if self.vocab.is_unk(tok.lower().strip()) is True and (tok.find("-") > -1 or tok.find("'") > -1 or tok.find(".") > -1) and not tok.endswith('-'):

                # Split the unknown token into a list.
                # KEEP special chars ('-.) in the array!
                _tabtoks = re.split("([-'.])", tok)

                # Explore the list from left to right
                t1 = 0
                while t1 < len(_tabtoks):
                    i = len(_tabtoks)
                    i_ok = 0
                    # Find the longest string in the dict
                    while i >= t1 and i_ok == 0:
                        _token = _tabtoks[t1]
                        if i > (t1+1):
                            for j in range(t1+1, i):
                                _token += _tabtoks[j]
                            if self.vocab.is_unk(_token) is False:
                                i_ok = j + 1
                        else:
                            i_ok = 1
                            _token = _tabtoks[t1]
                        i -= 1
                    t1 += i_ok
                    _utt.append( rutils.ToStrip( _token ))

            else:
                _utt.append( rutils.ToStrip( tok ))

        return _utt

    # End compound
    # ------------------------------------------------------------------

    def lower(self, utt):
        """
        Lower a list of strings.

        @param utt (list)

        """
        _utt = []
        for tok in utt:
            if "/" not in tok:
                _utt.append( rutils.ToLower( tok ))
            else:
                _utt.append( tok )

        return _utt

    # End lower
    # ------------------------------------------------------------------

    def remove(self, utt, wlist):
        """
        Remove data of an utterance if included in a dictionary.
        Only used to remove punctuation.

        @param utt (list) the utterance
        @param wlist (WordsList)

        """
        _utt = []
        for tok in utt:
            if wlist.is_unk(tok) is True and "gpd_" not in tok and "ipu_" not in tok:
                _utt.append( tok )

        return _utt

    # End remove
    # ------------------------------------------------------------------


    # ------------------------------------------------------------------
    # EOT specific modules
    # ------------------------------------------------------------------

    def __repl(self, obj):
        """
        Callback for clean_toe.

        @param obj (MatchObject)
        @return string

        """
        # Left part:
        # Remove parentheses
        left = obj.group(1).replace('(', '')
        left = left.replace(')', '')
        # Replace spaces with underscores
        left = "_".join(left.split())

        # Right part:
        # Remove spaces
        right = obj.group(2)
        right = "".join(right.split())

        return " [%s,%s]" % (left, right)

    # ------------------------------------------------------------------

    def clean_toe(self, entry):
        """
        Clean Enriched Orthographic Transcription.
        The convention includes information that must be removed.

        @param entry (string)
        @return string

        """
        # Proper names: $ name ,P\$
        entry = re.sub(u',\s?[PTS]+\s?[\\/\\\]+\s?\\$', ur'', entry, flags=re.UNICODE)
        entry = re.sub(ur'\$', ur'', entry, flags=re.UNICODE)

        entry = re.sub(u'(gpd_[0-9]+)', ur"\1 ", entry, flags=re.UNICODE)
        entry = re.sub(u'(ipu_[0-9]+)', ur"\1 ", entry, flags=re.UNICODE)

        # Remove invalid parenthesis content
        entry = re.sub(ur'\s+\([\w\xaa-\xff]+\)\s+', ' ', entry, flags=re.UNICODE)
        entry = re.sub(ur'^\([\w\xaa-\xff]+\)\s+', ' ', entry, flags=re.UNICODE)
        entry = re.sub(ur'\s+\([\w\xaa-\xff]+\)$', ' ', entry, flags=re.UNICODE)

        entry = re.sub(ur'\s*\[([^,]+),([^,]+)\]', self.__repl, entry, flags=re.UNICODE)

        return " ".join(entry.split())

    # End clean_toe and __repl
    # ------------------------------------------------------------------

    def toe_spelling(self, entry, std=False):
        """
        Create a specific spelling from an Enriched Orthographic Transcription.

        @param entry (string): the EOT string
        @return a string.

        DevNote: Python's regular expression engine supports Unicode.
        It can apply the same pattern to either 8-bit (encoded) or
        Unicode strings. To create a regular expression pattern that uses
        Unicode character classes for \w (and \s, and \b), use the "(?u)"
        flag prefix, or the re.UNICODE flag.

        """
        # Ensure all regexp will work!
        _fentry = " " + unicode(entry) + " "

        if std is False:
            # Stick unregular Liaisons to the previous token
            _fentry = re.sub(u' =([\w]+)=', ur'-\1', _fentry, flags=re.UNICODE)
        else:
            # Remove Liaisons
            _fentry = re.sub(u' =([\w]+)=', ur' ', _fentry, flags=re.UNICODE)

        # Laughing sequences
        _fentry = re.sub(u"\s?@\s?@\s?", u" ", _fentry, flags=re.UNICODE)

        # Laughing
        _fentry = re.sub(u"([\w\xaa-\xff]+)@", ur"\1 @", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"@([\w\xaa-\xff]+)", ur"@ \1", _fentry, flags=re.UNICODE)

        # Noises
        _fentry = re.sub(u"([\w\xaa-\xff]+)\*", ur"\1 *", _fentry, flags=re.UNICODE)
        _fentry = re.sub(u"\*([\w\xaa-\xff]+)", ur"* \1", _fentry, flags=re.UNICODE)

        # Transcriptor comment's: {comment}
        _fentry = re.sub(u'\\{[\s\w\xaa-\xff\-:]+\\}', ur'', _fentry, flags=re.UNICODE)
        # Transcriptor comment's: [comment]
        _fentry = re.sub(u'\\[[\s\w\xaa-\xff\-:]+\\]', ur'', _fentry, flags=re.UNICODE)
        # Transcription comment's: (comment)
        _fentry = re.sub(u' \\([\s\w\xaa-\xff\-:]+\\) ', ur'', _fentry, flags=re.UNICODE)
        # .... warning!

        if std is False:
            # Special elisions (remove parenthesis content)
            _fentry = re.sub(u'\\([\s\w\xaa-\xff\-\']+\\)', ur'', _fentry, flags=re.UNICODE)
        else:
            # Special elisions (keep parenthesis content)
            _fentry = re.sub(u'\\(([\s\w\xaa-\xff\-]+)\\)', ur'\1', _fentry, flags=re.UNICODE)

        # Morphological variants are ignored for phonetization (same pronunciation!)
        _fentry = re.sub(u'\s+\\<([\-\'\s\w\xaa-\xff]+),[\-\'\s\w\xaa-\xff]+\\>', ur' \1', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'\s+\\{([\-\'\s\w\xaa-\xff]+),[\-\'\s\w\xaa-\xff]+\\}', ur' \1', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'\s+\\/([\-\'\s\w0-9\xaa-\xff]+),[\-\'\s\w0-9\xaa-\xff]+\\/', ur' \1', _fentry, flags=re.UNICODE)

        if std is False:
            # Special pronunciations (keep right part)
            _fentry = re.sub(u'\s+\\[([\s\w\xaa-\xff/-]+),([\s\w\xaa-\xff/]+)\\]', ur' \2', _fentry, flags=re.UNICODE)
        else:
            # Special pronunciations (keep left part)
            _fentry = re.sub(u'\s+\\[([\s\w\xaa-\xff\\/-]+),[\s\w\xaa-\xff\\/]+\\]', ur' \1', _fentry, flags=re.UNICODE)

        # Proper names: $ name ,P\$
        _fentry = re.sub(u',\s?[PTS]+\s?[\\/\\\]+\s?\\$', ur'', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'\\$', ur'', _fentry, flags=re.UNICODE)

        # Add a space if some punctuation are sticked to a word
        # TODO: do the same with the whole list of punctuations (in rutils).
        # _fentry = re.sub(u'([:+^@}\(\){~|=]+)([\xaa-\xff]+)', ur'\1 \2', _fentry, re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+),', ur'\1 ,', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\+', ur'\1 +', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+);', ur'\1 ,', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+):', ur'\1 :', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\(', ur'\1 (', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\)', ur'\1 )', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\{', ur'\1 {', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\}', ur'\1 }', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)=', ur'\1 =', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\?', ur'\1 ?', _fentry, flags=re.UNICODE)
        _fentry = re.sub(u'([\w\xaa-\xff]+)\!', ur'\1 !', _fentry, flags=re.UNICODE)
        #_fentry = re.sub(u'([\w\xaa-\xff]+)\/', ur'\1 !', _fentry, re.UNICODE)  # no: if sampa in special pron.
        _fentry = re.sub(u"\s(?=,[0-9]+)", "", _fentry, flags=re.UNICODE)

        # Correction of errors:
        # remove the spaces inside a /.../ pronunciation, keep the slashes
        s = ""
        inpron = False
        for c in _fentry:
            if c == "/":
                inpron = not inpron
            else:
                if c == " " and inpron is True:
                    continue
            s += c

        return rutils.ToStrip(s)

    # End toe_spelling
    # ------------------------------------------------------------------


    # ------------------------------------------------------------------
    # The main tokenize is HERE!
    # ------------------------------------------------------------------

    def tokenize_list(self, utt, std=False):
        """
        Tokenize from a list of entries.
        """
        # Step 2: replace
        try:
            utt = self.replace( utt )
        except IOError:
            # repl file not found
            pass
        except Exception as e:
            raise Exception(" *in replace* " + str(e) + '\n')

        # Step 3: compound
        try:
            utt = self.compound( utt )
        except Exception as e:
            raise Exception(" *in compound* " + str(e) + '\n')

        # Step 4: stick (using the dictionary)
        try:
            attachement = "_"
            if (self.lang == "cmn" or self.lang == "jpn" or self.lang == "yue"):
                attachement = ""
            utt = self.stick( utt, attachement )
        except Exception as e:
            raise Exception(" *in stick* " + str(e) + '\n')

        # Step 5: num2letter
        try:
            _utt = []
            for i in utt:
                if not "/" in i:
                    _utt.append( self.num2letter.convert( i ) )
                else:
                    _utt.append( i )
            utt = _utt
        except Exception:
            pass

        # Step 6: lower
        try:
            utt = self.lower( utt )
        except Exception as e:
            raise Exception(" *in lower* " + str(e) + '\n')

        # Step 7: remove (punctuation)
        try:
            utt = self.remove( utt, self.punct )
        except Exception as e:
            raise Exception(" *in remove* " + str(e) + '\n')

        # Finally, prepare the result
        strres = ""
        for s in utt:
            s = rutils.ToStrip( s )
            strres = strres + u" " + s.replace(u" ", u"_")

        strres = rutils.ToStrip(strres)
        if len(strres) == 0:
            return "#"  # or "dummy" ???

        return strres.replace(u" ", self.delimiter)

    # ------------------------------------------------------------------

    def tokenize(self, entry, std=False):
        """
        Tokenize an utterance.

        @param entry (UTF8-String) is the utterance (the transcription)
        @param std (Boolean) In case of enriched transcription, std is used
        to fix the output as standard or faked spelling
        @return A string (the tokenized transcription)

        **TODO: disable TOE_CLEAN for written text**

        """
        # THE ENTRY (a transcription, a text...) IS A UTF8-STRING
        # -------------------------------------------------------
        _str = rutils.ToStrip( entry )

        # Remove UTF-8 specific characters that are not in our dictionaries!
        try:
            for key in self.dicoutf.get_keys():
                _str = _str.replace( key, self.dicoutf.replace(key) )
        except Exception as e:
            raise UnicodeError('Error during cleaning: %s' % str(e))

        # Enriched Orthographic Transcription:
        # create a faked spelling (default) or a standard spelling
        _str = self.clean_toe(_str)
        _str = self.toe_spelling(_str, std)

        # Step 1: split using spaces (or characters for asian languages)
        try:
            utt = self.split( _str, std )
        except Exception as e:
            raise Exception(" *in split* " + str(e))

        # THE ENTRY IS NOW A LIST OF STRINGS.
        # ---------------------------------------------------
        return self.tokenize_list(utt, std)
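# ----------------------------------------------------------------------------
# Illustrative usage sketch for DictTok (not part of the class). It only uses
# names that already appear above (DictTok, WordsList, RESOURCES_PATH); the
# import step and the vocabulary path are assumptions to be adapted to the
# actual package layout. The input utterance and the two expected spellings
# are the ones quoted in the DictTok docstring.

if __name__ == "__main__":
    import os
    # assumption: DictTok, WordsList and RESOURCES_PATH are importable,
    # as in the test modules above
    vocabfile = os.path.join(RESOURCES_PATH, "vocab", "fra.vocab")

    tok = DictTok(vocab=WordsList(vocabfile), lang="fra")
    utt = u"j'ai on a j'ai p- (en)fin j'ai trouvé l(e) meilleur moyen c'était d(e) [loger,locher] chez des amis"

    faked    = tok.tokenize(utt)             # faked spelling (default), used for phonetization
    standard = tok.tokenize(utt, std=True)   # standard orthographic tokens
    print faked
    print standard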
class sppasRepetition( ):
    """
    SPPAS Automatic Repetition Detection
    (either self-repetitions or other-repetitions).

    This annotation is performed on the basis of aligned-tokens.
    The tokens can be lemmatized from a dictionary.

    The output is made of 2 tiers including intervals with sources and echoes.

    How to use sppasRepetition?

    >>> p = sppasRepetition( resourcefile )
    >>> p.run(inputtrsname, outputfilename)

    """

    def __init__(self, resourcefile, logfile=None):
        """
        Create a new sppasRepetition instance.

        @param resourcefile is either the lemma dictionary or the list of stop-words.

        Attention: the extension of the resource file name is very important:
        it must be ".stp" for stop-words and ".lem" for lemmas (case-sensitive)!

        """
        # Members
        self._merge         = False  # Merge input in the output
        self._use_lemmatize = True   # Lemmatize the input
        self._use_stopwords = True   # Add specific stopwords of the input
        self._empan         = 5      # Detection length (nb of IPUs; 1=current IPU)
        self._alpha         = 0.5    # Specific stop-words threshold coefficient
        self.logfile        = logfile
        self.lemmatizer     = None
        self.stopwords      = None

        # Create the lemmatizer instance
        try:
            lemmafile = resourcefile.replace(".stp", ".lem")
            self.lemmatizer = LemmaDict(lemmafile)
        except Exception:
            self._use_lemmatize = False

        if (self._use_lemmatize is True and self.lemmatizer.get_size() == 0) or self._use_lemmatize is False:
            if logfile is not None:
                logfile.print_message("Lemmatization disabled.", indent=2, status=3)
            else:
                print " ... ... [ INFO ] Lemmatization disabled."
            self._use_lemmatize = False

        # Create the list of stop words (list of non-relevant words)
        try:
            stopfile = resourcefile.replace(".lem", ".stp")
            self.stopwords = WordsList(filename=stopfile, nodump=True)
            if self.stopwords.get_size() == 0:
                self._use_stopwords = False
        except Exception:
            self.stopwords = WordsList()

        #if (self._use_stopwords is True and self.stopwords.get_size() == 0) or self._use_stopwords is False:
        if self._use_stopwords is False:
            if logfile is not None:
                logfile.print_message("StopWords disabled.", indent=2, status=3)
            else:
                print " ... ... [ INFO ] StopWords disabled."
            #self._use_stopwords = False

    # End __init__
    # ------------------------------------------------------------------

    def fix_options(self, options):
        for opt in options:
            if "merge" == opt.get_key():
                self.set_merge( opt.get_value() )
            elif "stopwords" == opt.get_key():
                self.set_use_stopwords( opt.get_value() )
            elif "lemmatize" == opt.get_key():
                self.set_use_lemmatize( opt.get_value() )
            elif "empan" == opt.get_key():
                self.set_empan( opt.get_value() )
            elif "alpha" == opt.get_key():
                self.set_alpha( opt.get_value() )

    # End fix_options
    # ------------------------------------------------------------------


    # ######################################################################
    #
    # Getters and Setters
    #
    # ######################################################################

    def set_merge(self, merge):
        """
        Fix the merge option.
        If merge is set to True, sppasRepetition() will save the input tiers
        in the output file.

        @param merge (Boolean)

        """
        self._merge = merge

    # End set_merge
    # ----------------------------------------------------------------------

    def set_use_lemmatize(self, use_lemmatize):
        """
        Fix the use_lemmatize option.
        If use_lemmatize is set to True, sppasRepetition() will lemmatize the
        input before the repetition automatic detection.

        @param use_lemmatize (Boolean)

        """
        self._use_lemmatize = use_lemmatize

    # End set_use_lemmatize
    # ----------------------------------------------------------------------

    def set_use_stopwords(self, use_stopwords):
        """
        Fix the use_stopwords option.
        If use_stopwords is set to True, sppasRepetition() will add specific
        stopwords to the stopwords list (deduced from the input text).

        @param use_stopwords (Boolean)

        """
        self._use_stopwords = use_stopwords

    # End set_use_stopwords
    # ----------------------------------------------------------------------

    def set_empan(self, empan):
        """
        Fix the empan option.

        @param empan (int)

        """
        self._empan = empan

    # End set_empan
    # ----------------------------------------------------------------------

    def set_alpha(self, alpha):
        """
        Fix the alpha option.

        @param alpha (int or float)

        """
        self._alpha = alpha

    # End set_alpha
    # ----------------------------------------------------------------------


    # ######################################################################
    #
    # Automatic Detection search parameters
    #
    # ######################################################################

    def lemmatize(self, inputtier):
        """
        Lemmatize a tier.

        @param inputtier (Tier)

        """
        if self._use_lemmatize is False:
            return

        lemmatier = inputtier.Copy()
        for i in range(lemmatier.GetSize()):
            lem = self.lemmatizer.get_lem( lemmatier[i].GetLabel().GetValue() )
            lemmatier[i].GetLabel().SetValue( lem )

        return lemmatier

    # ------------------------------------------------------------------

    def relevancy(self, inputtier):
        """
        Add very frequent tokens in a copy of the stopwords list.
        Return a WordsList instance.

        Estimate the relevance of each term by using the number of
        occurrences of this term in the input, and compare this value to a
        threshold, to add the term (or not) in the stopwords list.

        @param inputtier (Tier)

        """
        l = self.stopwords.copy()

        # Create the Unigram and put data
        u = Unigram()
        for a in inputtier:
            if a.GetLabel().IsSpeech() is True:
                u.add( a.GetLabel().GetValue() )

        # Estimate if a token is relevant; if not, put it in the stoplist.
        # A token is added to the stoplist when its relative frequency
        # exceeds 1 / (number of distinct tokens * alpha).
        for token in u.get_tokens():
            freq  = u.get_value(token)
            proba = float(freq) / float(u.get_sum())
            relevant = 1.0 / (float(u.get_size()) * float(self._alpha))
            if proba > relevant:
                l.add( token )
                if self.logfile is not None:
                    self.logfile.print_message('Add in the stoplist: '+token, indent=3)
                elif DEBUG is True:
                    print(' ... ... ... Add in the stoplist: '+token.encode('utf8'))

        return l

    # End relevancy
    # ------------------------------------------------------------------

    def find_next_break(self, inputtier, start, empan):
        """
        Return the index of the next interval representing a break.
        This depends on the 'empan' value.

        @param start is the position of the token where the search will start

        """
        nbbreaks = 0
        for i in range(start, inputtier.GetSize()):
            if inputtier[i].GetLabel().IsSilence():
                nbbreaks = nbbreaks + 1
                if nbbreaks == empan:
                    return i
        return inputtier.GetSize() - 1

    # End find_next_break
    # ------------------------------------------------------------------


    # ######################################################################
    #
    # Automatic Detection search
    #
    # ######################################################################

    def _addrepetition(self, repeatobj, nbrepeats, inputtier1, inputtier2, tokstartsrc, tokstartrep, srctier, reptier):
        """
        Add sources and repetitions from repeatobj to the tiers.
        """
        n = 0
        for i in range(repeatobj.get_repeats_size()):

            # Source
            s, e = repeatobj.get_repeat_source(i)
            srcbegin = inputtier1[tokstartsrc+s].GetLocation().GetBegin()
            srcend   = inputtier1[tokstartsrc+e].GetLocation().GetEnd()
            time = TimeInterval(srcbegin.Copy(), srcend.Copy())
            srcann = Annotation(time, Label("S" + str(nbrepeats + n)))
            try:
                srctier.Add(srcann)
                if DEBUG:
                    print "src annotation added: ", srcann
            except Exception:
                continue

            # Repetition
            rep = repeatobj.get_repeat_repetition(i)
            for r in rep:
                (s, e) = r
                repbegin = inputtier2[tokstartrep+s].GetLocation().GetBegin()
                repend   = inputtier2[tokstartrep+e].GetLocation().GetEnd()
                r = reptier.Lindex(repbegin)  # time)
                l = reptier.Rindex(repend)    # time)

                # all other cases (no repetition, overlap)
                time = TimeInterval(repbegin.Copy(), repend.Copy())
                repann = Annotation(time, Label("R" + str(nbrepeats + n)))
                reptier.Add(repann)
                if DEBUG:
                    print "rep annotation added: ", repann

            n = n + 1
        # end for

        return n

    # ------------------------------------------------------------------

    def selfdetection(self, inputtier1):
        """
        Self-Repetition detection.

        @param inputtier1 (Tier)

        """
        # Verifications: is there any data?
        if inputtier1.IsEmpty() is True:
            raise Exception("Repetition. Empty input tokens tier.\n")

        # Update the stoplist
        if self._use_stopwords is True:
            stpw = self.relevancy( inputtier1 )
        else:
            stpw = self.stopwords

        # Create repeat objects
        repeatobj = Repetitions( )

        # Create output data
        srctier = Tier("Sources")
        reptier = Tier("Repetitions")
        nbrepeats = 1

        # Initialization of tokstart and tokend
        tokstart = 0
        if inputtier1[0].GetLabel().IsDummy():
            tokstart = 1
        toksearch = self.find_next_break( inputtier1, tokstart+1, empan=1)
        tokend    = self.find_next_break( inputtier1, tokstart+1, empan=self._empan)

        # Detection is here:
        while tokstart < tokend:

            # Build an array with the tokens
            tokens1 = list()
            for i in range(tokstart, tokend+1):
                tokens1.append( inputtier1[i].GetLabel().GetValue() )
            speaker1 = DataSpeaker( tokens1, stpw )

            # Detect repeats in these data
            repeatobj.detect( speaker1, toksearch-tokstart, None )

            # Save repeats
            if repeatobj.get_repeats_size() > 0:
                n = self._addrepetition(repeatobj, nbrepeats, inputtier1, inputtier1, tokstart, tokstart, srctier, reptier)
                nbrepeats = nbrepeats + n

            # Prepare next search
            tokstart  = toksearch
            toksearch = self.find_next_break( inputtier1, tokstart+1, empan=1 )
            tokend    = self.find_next_break( inputtier1, tokstart+1, empan=self._empan )

        return (srctier, reptier)

    # End selfdetection
    # ------------------------------------------------------------------------

    def otherdetection(self, inputtier1, inputtier2):
        """
        Other-Repetition detection.

        @param inputtier (Tier)

        """
        # Verifications: is there any data?
        if inputtier1.IsEmpty() is True:
            raise Exception("Repetition. Empty input tokens tier.\n")

        # Update the stoplist
        if self._use_stopwords is True:
            # other-repetition: relevance of the echoing-speaker
            stpw = self.relevancy( inputtier2 )
        else:
            stpw = self.stopwords

        # Create repeat objects
        repeatobj = Repetitions( )

        # Create output data
        srctier = Tier("OR-Source")
        reptier = Tier("OR-Repetition")
        nbrepeats = 1

        # Initialization of tokstart, and tokend
        tokstartsrc = 0
        if inputtier1[0].GetLabel().IsDummy():
            tokstartsrc = 1
        tokendsrc = min(20, inputtier1.GetSize()-1)

        # Detection is here:
        # detect() is applied word by word, from tokstart to tokend
        while tokstartsrc < tokendsrc:

            # Build an array with the tokens
            tokens1 = list()
            for i in range(tokstartsrc, tokendsrc):
                tokens1.append( inputtier1[i].GetLabel().GetValue() )
            speaker1 = DataSpeaker( tokens1, stpw )

            # Create speaker2
            tokens2 = list()
            nbbreaks = 0
            tokstartrep = -1
            a = inputtier1[tokstartsrc]
            for (r, b) in enumerate(inputtier2):
                if b.GetLocation().GetBeginMidpoint() >= a.GetLocation().GetBeginMidpoint():
                    if tokstartrep == -1:
                        tokstartrep = r
                    if b.GetLabel().IsSilence():
                        nbbreaks = nbbreaks + 1
                        if nbbreaks == self._empan:
                            break
                    tokens2.append( b.GetLabel().GetValue() )
            speaker2 = DataSpeaker( tokens2, stpw )

            if DEBUG is True:
                print "SRC : ", speaker1
                print "ECHO: ", speaker2

            # Detect repeats in these data: search if the first token of spk1
            # is the first token of a source.
            repeatobj.detect( speaker1, 1, speaker2 )

            # Save repeats
            shift = 1
            if repeatobj.get_repeats_size() > 0:
                if DEBUG is True:
                    print " ----> found : "
                    repeatobj.get_repeat(0).print_echo()
                s, e = repeatobj.get_repeat_source(0)
                n = self._addrepetition(repeatobj, nbrepeats, inputtier1, inputtier2, tokstartsrc, tokstartrep, srctier, reptier)
                if n > 0:
                    nbrepeats = nbrepeats + n
                shift = e + 1

            while speaker1.is_token(speaker1.get_token(shift)) is False and shift < 20:
                shift = shift + 1

            tokstartsrc = tokstartsrc + shift
            tokstartsrc = min(tokstartsrc, inputtier1.GetSize()-1)
            tokendsrc   = min(tokstartsrc + 20, inputtier1.GetSize()-1)

        return (srctier, reptier)

    # End otherdetection
    # ------------------------------------------------------------------------


    # ######################################################################
    #
    # Run
    #
    # ######################################################################

    def run(self, inputfilename1, inputfilename2=None, outputfilename=None):
        """
        Run the Repetition Automatic Detection annotation.

        @param inputfilename
        @param outputfilename

        """
        tokentier1 = None  # First tier
        tokentier2 = -1    # No echoing speaker

        try:
            # Find the token tier
            trsinput1 = annotationdata.io.read( inputfilename1 )
            for i in range( trsinput1.GetSize() ):
                if "token" in trsinput1[i].GetName().lower() and "align" in trsinput1[i].GetName().lower():
                    tokentier1 = i
                    break

            if inputfilename2 is not None:
                # Find the token tier
                trsinput2 = annotationdata.io.read( inputfilename2 )
                for i in range( trsinput2.GetSize() ):
                    if "token" in trsinput2[i].GetName().lower() and "align" in trsinput2[i].GetName().lower():
                        tokentier2 = i
                        break
        except Exception as e:
            raise Exception('Repetitions. ' + str(e))

        if tokentier1 is None:
            raise Exception('Repetitions. No valid input tier (expected: TokensAlign).')

        # Lemmatize input?
        if self._use_lemmatize is True and self.lemmatizer:
            tier1 = self.lemmatize( trsinput1[tokentier1] )
            if tokentier2 > -1:
                tier2 = self.lemmatize( trsinput2[tokentier2] )
        else:
            tier1 = trsinput1[tokentier1]
            if tokentier2 > -1:
                tier2 = trsinput2[tokentier2]

        if self.logfile is not None:
            self.logfile.print_message("Empan = " + str(self._empan), indent=3)
            self.logfile.print_message("Alpha = " + str(self._alpha), indent=3)

        # Repetition Automatic Detection
        if tokentier2 == -1:
            (srctier, reptier) = self.selfdetection( tier1 )
        else:
            (srctier, reptier) = self.otherdetection( tier1, tier2 )

        # Manage results:
        # An output file name is given
        if outputfilename:
            trsoutput = Transcription("Repetitions")
            if self._merge is True:
                for i in range( trsinput1.GetSize() ):
                    trsoutput.Add( trsinput1[i] )
        # the repeat tiers are added to the input transcription
        else:
            outputfilename = inputfilename1
            trsoutput = annotationdata.io.read( inputfilename1 )

        # Add repeats to this trsoutput
        trsoutput.Append( srctier )
        trsoutput.Append( reptier )
        trsoutput.SetMinTime( trsinput1.GetMinTime() )
        trsoutput.SetMaxTime( trsinput1.GetMaxTime() )  # hum, in case of OR... not sure! to be verified.

        # Save
        annotationdata.io.write( outputfilename, trsoutput )
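# ----------------------------------------------------------------------------
# Illustrative usage sketch for sppasRepetition (not part of the class),
# following the example in its docstring. The file names below are
# hypothetical placeholders; per __init__, the resource file must end in
# ".stp" (stop-words) or ".lem" (lemmas).

if __name__ == "__main__":
    # Self-repetition detection for a single speaker:
    p = sppasRepetition("fra.stp")           # hypothetical stop-words file
    p.set_empan(5)                           # search span, in number of IPUs
    p.set_alpha(0.5)                         # stop-word threshold coefficient
    p.run("speaker1-token-align.xra", outputfilename="speaker1-repetition.xra")

    # Other-repetition detection needs the echoing speaker as second input:
    # p.run("speaker1-token-align.xra", "speaker2-token-align.xra", "or-result.xra")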
def test_ita(self):
    l = WordsList( ITA )
    self.assertTrue( l.is_unk('toto') )
    self.assertFalse( l.is_unk(u'perché') )