def test_align(self):
    """Check convert()+align() on French EOT utterances.

    Each utterance mixes elisions in parentheses and [standard, faked]
    variants; after alignment, the standard and faked tiers must carry
    the expected tokenizations, token for token.
    """
    vocabfile = os.path.join(SPPAS, "resources", "vocab", "FR.vocab")
    tok = sppasTok(vocabfile, "FR")

    # One utterance per one-second interval, starting at t=0.
    utterances = (
        u"pa(r)ce que j'ai euh",
        u"un p(e)tit peu",
        u"[i(l)s, iz] ont pas d(e) culture",
        u"d'aut(re)",
        u"(e)st-ce qu'elle a l'air bien ou pas",
        u"p(eu)t-êt(re) moins évident",
        u"[pa(r)ce que, passe] c'est euh",
        u"t(out) ça",
        u"j'(ai) euh",
        u"[entre-elles, entrèl]",
    )
    tier = Tier()
    for idx, text in enumerate(utterances):
        interval = TimeInterval(TimePoint(idx), TimePoint(idx + 1))
        tier.Append(Annotation(interval, Label(text)))

    faked, std = tok.convert(tier)
    tok.align(std, faked)

    # (standard form, faked form) expected for each annotation.
    # NOTE(review): utterance 9 (u"[entre-elles, entrèl]") has no
    # expected pair in the original test either — confirm intentional.
    expected = (
        (u"parce_que j' ai euh", u"pace_que j' ai euh"),
        (u"un_petit_peu", u"un_ptit_peu"),
        (u"ils_ont pas de culture", u"iz ont_pas d culture"),
        (u"d'autre", u"d'aut"),
        (u"est-ce_qu' elle a l' air bien ou pas",
         u"st-ce_qu' elle a l' air bien ou pas"),
        (u"peut-être moins évident", u"ptêt moins évident"),
        (u"parce_que c'est euh", u"passe c'est euh"),
        (u"tout_ça", u"t_ça"),
        (u"j' euh", u"j' euh"),
    )
    for idx, (std_text, faked_text) in enumerate(expected):
        self.assertEqual(std[idx].TextValue, std_text)
        self.assertEqual(faked[idx].TextValue, faked_text)
def test_align(self):
    """Tokenize a small French tier and verify the aligned outputs.

    NOTE(review): a nearly identical ``test_align`` is defined just
    above this one — if both definitions belong to the same class, the
    later one silently shadows the earlier and only one of them runs;
    confirm and rename if both are wanted.
    """
    vocab_path = os.path.join(SPPAS, "resources", "vocab", "FR.vocab")
    tokenizer = sppasTok(vocab_path, "FR")

    raw_lines = [
        u"pa(r)ce que j'ai euh",
        u"un p(e)tit peu",
        u"[i(l)s, iz] ont pas d(e) culture",
        u"d'aut(re)",
        u"(e)st-ce qu'elle a l'air bien ou pas",
        u"p(eu)t-êt(re) moins évident",
        u"[pa(r)ce que, passe] c'est euh",
        u"t(out) ça",
        u"j'(ai) euh",
        u"[entre-elles, entrèl]",
    ]

    # Build one annotation per line on consecutive 1-second intervals.
    tier = Tier()
    for start, content in enumerate(raw_lines):
        span = TimeInterval(TimePoint(start), TimePoint(start + 1))
        tier.Append(Annotation(span, Label(content)))

    faked, std = tokenizer.convert(tier)
    tokenizer.align(std, faked)

    std_expected = [
        u"parce_que j' ai euh",
        u"un_petit_peu",
        u"ils_ont pas de culture",
        u"d'autre",
        u"est-ce_qu' elle a l' air bien ou pas",
        u"peut-être moins évident",
        u"parce_que c'est euh",
        u"tout_ça",
        u"j' euh",
    ]
    faked_expected = [
        u"pace_que j' ai euh",
        u"un_ptit_peu",
        u"iz ont_pas d culture",
        u"d'aut",
        u"st-ce_qu' elle a l' air bien ou pas",
        u"ptêt moins évident",
        u"passe c'est euh",
        u"t_ça",
        u"j' euh",
    ]
    for pos, (s_text, f_text) in enumerate(zip(std_expected, faked_expected)):
        self.assertEqual(std[pos].TextValue, s_text)
        self.assertEqual(faked[pos].TextValue, f_text)
def run_tokenization(self, stepidx):
    """Execute the SPPAS-Tokenization program on each audio file.

    For every ".wav" file found (ignoring extracted "track_*" files),
    look for an existing transcription file and run the tokenizer on
    it, writing the result next to the audio as "<name>-token<ext>".
    Errors on one file are logged and do not stop the other files.

    @param stepidx: (int) index of the Tokenization step, used to
        retrieve its language resources and options.
    @return: (int) number of files processed successfully.

    """
    # Initializations
    step = self.parameters.get_step(stepidx)
    stepname = self.parameters.get_step_name(stepidx)
    files_processed_success = 0
    self._progress.set_header(stepname)
    self._progress.update(0, "")

    # Get the list of input file names, with the ".wav" extension,
    # excluding the "track_*" files extracted from them.
    filelist = self.set_filelist(".wav", not_start=["track_"])
    if len(filelist) == 0:
        # Nothing to do: no file was processed.
        return 0
    total = len(filelist)

    # Create the annotation instance: loads the lexical resources once,
    # which can fail (e.g. missing vocabulary file).
    try:
        self._progress.set_text("Loading resources...")
        t = sppasTok(step.get_langresource(), logfile=self._logfile, lang=step.get_lang())
    except Exception as e:
        if self._logfile is not None:
            self._logfile.print_message("%s\n" % str(e), indent=1, status=1)
        return 0

    # The options come from the step, not from the file: fix them once
    # instead of re-applying them at every iteration of the loop below.
    t.fix_options(step.get_options())

    # Execute the annotation for each file in the list
    for i, f in enumerate(filelist):

        # Indicate the file to be processed
        self._progress.set_text(os.path.basename(f) + " (" + str(i + 1) + "/" + str(total) + ")")
        if self._logfile is not None:
            self._logfile.print_message(stepname + " of file " + f, indent=1)

        # Get the input transcription file: prefer the configured output
        # format, then any other supported multi-tiers format.
        inname = self._get_filename(f, [self.parameters.get_output_format()] + annotationdata.io.extensions_out_multitiers)
        if inname is not None:
            # Fix the output file name
            outname = os.path.splitext(f)[0] + '-token' + self.parameters.get_output_format()

            # Execute the annotation; log the failure but keep going on
            # the remaining files.
            try:
                t.run(inname, outputfile=outname)
            except Exception as e:
                if self._logfile is not None:
                    self._logfile.print_message("%s for file %s\n" % (str(e), outname), indent=2, status=-1)
            else:
                files_processed_success += 1
                if self._logfile is not None:
                    self._logfile.print_message(outname, indent=2, status=0)
        else:
            if self._logfile is not None:
                self._logfile.print_message("Failed to find a file with transcription. Read the documentation for details.", indent=2, status=2)

        # Indicate progress
        self._progress.set_fraction(float((i + 1)) / float(total))
        if self._logfile is not None:
            self._logfile.print_newline()

    # Indicate completed!
    self._progress.update(1, "Completed (%d files successfully over %d files).\n" % (files_processed_success, total))
    self._progress.set_header("")

    return files_processed_success
args = parser.parse_args() # ---------------------------------------------------------------------------- # Automatic Tokenization is here: # ---------------------------------------------------------------------------- base = os.path.basename( args.vocab ) lang = base[:3] delim = ' ' if args.delimiter: delim = unicode(args.delimiter) if args.i: p = sppasTok( args.vocab,lang ) p.tokenizer.set_delim( delim ) p.run( args.i, args.o ) else: vocab = WordsList( args.vocab ) tokenizer = DictTok( vocab,lang ) try: repl = DictRepl(os.path.join(RESOURCES_PATH, "repl", lang + ".repl"), nodump=True) tokenizer.set_repl( repl ) except Exception as e: print "[warning] No replacement dictionary: ",str(e) try: punct = WordsList(os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt"), nodump=True )