Example #1
    def test_align(self):
        dictdir = os.path.join(SPPAS, "resources", "vocab")
        vocabfile = os.path.join(dictdir, "FR.vocab")
        tok = sppasTok(vocabfile, "FR")
        tier = Tier()
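        # Enriched orthography samples: sounds written in parentheses are
        # elided in the faked form; brackets give a [standard spelling,
        # pronounced form] pair of variants.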
        lines = (
                 u"pa(r)ce que j'ai euh",
                 u"un p(e)tit peu",
                 u"[i(l)s, iz] ont pas d(e) culture",
                 u"d'aut(re)",
                 u"(e)st-ce qu'elle a l'air bien ou pas",
                 u"p(eu)t-êt(re) moins évident",
                 u"[pa(r)ce que, passe] c'est euh",
                 u"t(out) ça",
                 u"j'(ai) euh",
                 u"[entre-elles, entrèl]"
                 )
        for i, line in enumerate(lines):
            a = Annotation(TimeInterval(TimePoint(i), TimePoint(i+1)),
                           Label(line))
            tier.Append(a)

        # convert() builds a faked and a standard orthography tier from the
        # enriched transcription; align() makes both tiers strictly parallel.
        faked, std = tok.convert(tier)
        tok.align(std, faked)

        self.assertEqual(std[0].TextValue, u"parce_que j' ai euh")
        self.assertEqual(faked[0].TextValue, u"pace_que j' ai euh")

        self.assertEqual(std[1].TextValue, u"un_petit_peu")
        self.assertEqual(faked[1].TextValue, u"un_ptit_peu")

        self.assertEqual(std[2].TextValue, u"ils_ont pas de culture")
        self.assertEqual(faked[2].TextValue, u"iz ont_pas d culture")

        self.assertEqual(std[3].TextValue, u"d'autre")
        self.assertEqual(faked[3].TextValue, u"d'aut")

        self.assertEqual(std[4].TextValue, u"est-ce_qu' elle a l' air bien ou pas")
        self.assertEqual(faked[4].TextValue, u"st-ce_qu' elle a l' air bien ou pas")

        self.assertEqual(std[5].TextValue, u"peut-être moins évident")
        self.assertEqual(faked[5].TextValue, u"ptêt moins évident")

        self.assertEqual(std[6].TextValue, u"parce_que c'est euh")
        self.assertEqual(faked[6].TextValue, u"passe c'est euh")

        self.assertEqual(std[7].TextValue, u"tout_ça")
        self.assertEqual(faked[7].TextValue, u"t_ça")

        self.assertEqual(std[8].TextValue, u"j' euh")
        self.assertEqual(faked[8].TextValue, u"j' euh")
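
The test above exercises the whole convert/align cycle. Outside of a test class the same calls boil down to a few lines; the sketch below reuses only names that appear in the example (imports and the SPPAS resource path are omitted or assumed, so treat it as an illustration rather than a ready-to-run script):

vocabfile = os.path.join(SPPAS, "resources", "vocab", "FR.vocab")  # assumed resource layout
tok = sppasTok(vocabfile, "FR")

tier = Tier()
tier.Append(Annotation(TimeInterval(TimePoint(0), TimePoint(1)),
                       Label(u"un p(e)tit peu")))

faked, std = tok.convert(tier)
tok.align(std, faked)

print std[0].TextValue    # u"un_petit_peu"  (as asserted above)
print faked[0].TextValue  # u"un_ptit_peu"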
Example #2
    def run_tokenization(self, stepidx):
        """
        Execute the SPPAS-Tokenization program.

        @param stepidx index of the annotation step in the parameters
        @return number of files processed successfully
        """
        # Initializations
        step = self.parameters.get_step(stepidx)
        stepname = self.parameters.get_step_name(stepidx)
        files_processed_success = 0
        self._progress.set_header(stepname)
        self._progress.update(0, "")

        # Get the list of input file names, with the ".wav" (or ".wave") extension
        filelist = self.set_filelist(".wav", not_start=["track_"])
        if len(filelist) == 0:
            return 0
        total = len(filelist)

        # Create annotation instance
        try:
            self._progress.set_text("Loading resources...")
            t = sppasTok(step.get_langresource(), logfile=self._logfile, lang=step.get_lang())
        except Exception as e:
            if self._logfile is not None:
                self._logfile.print_message("%s\n" % str(e), indent=1, status=1)
            return 0

        # Execute the annotation for each file in the list
        for i, f in enumerate(filelist):

            # fix the default values
            t.fix_options(step.get_options())

            # Indicate the file to be processed
            self._progress.set_text(os.path.basename(f) + " (" + str(i+1) + "/" + str(total) + ")")
            if self._logfile is not None:
                self._logfile.print_message(stepname + " of file " + f, indent=1)

            # Get the input file (a transcription in one of the supported
            # formats, e.g. '.xra', '.TextGrid', '.eaf', '.trs', '.csv', '.mrk')
            inname = self._get_filename(f, [self.parameters.get_output_format()] +
                                           annotationdata.io.extensions_out_multitiers)
            if inname is not None:

                # Fix output file name
                outname = os.path.splitext(f)[0] + '-token' + self.parameters.get_output_format()

                # Execute annotation
                try:
                    t.run(inname, outputfile=outname)
                except Exception as e:
                    if self._logfile is not None:
                        self._logfile.print_message("%s for file %s\n" % (str(e), outname), indent=2, status=-1)
                else:
                    files_processed_success += 1
                    if self._logfile is not None:
                        self._logfile.print_message(outname, indent=2, status=0)

            else:
                if self._logfile is not None:
                    self._logfile.print_message("Failed to find a file with a transcription. "
                                                "Read the documentation for details.",
                                                indent=2, status=2)

            # Indicate progress
            self._progress.set_fraction(float(i+1) / float(total))
            if self._logfile is not None:
                self._logfile.print_newline()

        # Indicate completed!
        self._progress.update(1, "Completed (%d of %d files processed successfully).\n" % (files_processed_success, total))
        self._progress.set_header("")

        return files_processed_success
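
The _get_filename helper used above is not part of this excerpt. Judging from the call site, it takes an audio file name plus a list of extensions and returns the name of an existing transcription file sharing the same base name, or None when nothing matches. A hedged sketch under that assumption (not the actual SPPAS implementation):

    def _get_filename(self, filename, extensions):
        """Return an existing file with the same base name and one of the
        given extensions, or None if no such file is found."""
        base = os.path.splitext(filename)[0]
        for ext in extensions:
            candidate = base + ext
            if os.path.isfile(candidate):
                return candidate
        return None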
Example #3
args = parser.parse_args()


# ----------------------------------------------------------------------------
# Automatic Tokenization is here:
# ----------------------------------------------------------------------------

# Infer the language code from the first three characters of the vocabulary
# file's base name (e.g. a file named "fra.vocab" gives "fra").
base = os.path.basename(args.vocab)
lang = base[:3]

delim = ' '
if args.delimiter:
    delim = unicode(args.delimiter)

# With an input file, run the full sppasTok annotation on it; otherwise build
# a DictTok tokenizer by hand and load its optional resources one by one.
if args.i:
    p = sppasTok(args.vocab, lang)
    p.tokenizer.set_delim(delim)
    p.run(args.i, args.o)

else:

    vocab = WordsList(args.vocab)
    tokenizer = DictTok(vocab, lang)

    try:
        repl = DictRepl(os.path.join(RESOURCES_PATH, "repl", lang + ".repl"), nodump=True)
        tokenizer.set_repl(repl)
    except Exception as e:
        print "[warning] No replacement dictionary:", str(e)
    try:
        punct = WordsList(os.path.join(RESOURCES_PATH, "vocab", "Punctuations.txt"), nodump=True)