Example
class sppasRepetition( ):
    """
    SPPAS Automatic Repetition Detection
    (either self-repetitions or other-repetitions).

    This annotation is performed on the basis of aligned-tokens.
    The tokens can be lemmatized from a dictionary.

    The output is made of 2 tiers including intervals with
    sources and echoes.

    How to use sppasRepetition?

    >>> p = sppasRepetition( dictpath, lang )
    >>> p.run(inputtrsname, outputfilename)

    """


    def __init__(self, resourcefile, logfile=None):
        """
        Create a new sppasRepetition instance.

        @param resourcefile is either the lemma dictionary or the list of stop-words.

        Attention: the extension of the resource file name is very important:
        it must be ".stp" for stop-words and ".lem" for lemmas (case-sensitive)!

        """

        # Members
        self._merge         = False  # Merge input in the output
        self._use_lemmatize = True   # Lemmatize the input
        self._use_stopwords = True   # Add specific stopwords of the input
        self._empan  = 5             # Detection length (nb of IPUs; 1=current IPU)
        self._alpha  = 0.5           # Specific stop-words threshold coefficient
        self.logfile = logfile
        self.lemmatizer = None
        self.stopwords  = None

        # Create the lemmatizer instance
        try:
            lemmafile = resourcefile.replace(".stp", ".lem")
            self.lemmatizer = LemmaDict(lemmafile)
        except Exception:
            self._use_lemmatize = False

        if self._use_lemmatize is False or self.lemmatizer.get_size() == 0:
            if logfile is not None:
                logfile.print_message("Lemmatization disabled.", indent=2, status=3)
            else:
                print " ... ... [ INFO ] Lemmatization disabled."
            self._use_lemmatize = False

        # Create the list of stop words (list of non-relevant words)
        try:
            stopfile = resourcefile.replace(".lem", ".stp")
            self.stopwords = WordsList(filename=stopfile, nodump=True)
            if self.stopwords.get_size() == 0:
                self._use_stopwords = False
        except Exception:
            self.stopwords = WordsList()

        if self._use_stopwords is False:
            if logfile is not None:
                logfile.print_message("StopWords disabled.", indent=2, status=3)
            else:
                print " ... ... [ INFO ] StopWords disabled."

    # End __init__
    # ------------------------------------------------------------------


    def fix_options(self, options):
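        """
        Fix the annotation options from a list of option objects
        (expected keys: "merge", "stopwords", "lemmatize", "empan", "alpha").

        @param options (list)

        """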
        for opt in options:
            if "merge" == opt.get_key():
                self.set_merge( opt.get_value() )
            elif "stopwords" == opt.get_key():
                self.set_use_stopwords( opt.get_value() )
            elif "lemmatize" == opt.get_key():
                self.set_use_lemmatize( opt.get_value() )
            elif "empan" == opt.get_key():
                self.set_empan( opt.get_value() )
            elif "alpha" == opt.get_key():
                self.set_alpha( opt.get_value() )

    # End fix_options
    # ------------------------------------------------------------------


    # ###################################################################### #
    # Getters and Setters                                                    #
    # ###################################################################### #


    def set_merge(self, merge):
        """
        Fix the merge option.
        If merge is set to True, sppasRepetition() will save the input tiers
        in the output file.

        @param merge (Boolean)

        """
        self._merge = merge

    # End set_merge
    # ----------------------------------------------------------------------


    def set_use_lemmatize(self, use_lemmatize):
        """
        Fix the use_lemmatize option.

        If use_lemmatize is set to True, sppasRepetition() will lemmatize the
        input before the repetition automatic detection.

        @param use_lemmatize (Boolean)

        """
        self._use_lemmatize = use_lemmatize

    # End set_use_lemmatize
    # ----------------------------------------------------------------------


    def set_use_stopwords(self, use_stopwords):
        """
        Fix the use_stopwords option.

        If use_stopwords is set to True, sppasRepetition() will add specific
        stopwords to the stopwords list (deduced from the input text).

        @param use_stopwords (Boolean)

        """
        self._use_stopwords = use_stopwords

    # End set_use_stopwords
    # ----------------------------------------------------------------------


    def set_empan(self, empan):
        """
        Fix the empan option (the detection span, in number of IPUs).

        @param empan (int)

        """
        self._empan = empan

    # End set_empan
    # ----------------------------------------------------------------------


    def set_alpha(self, alpha):
        """
        Fix the alpha option (the threshold coefficient used to add specific stop-words).

        @param alpha (int or float)

        """
        self._alpha = alpha

    # End set_alpha
    # ----------------------------------------------------------------------


    # ###################################################################### #
    # Automatic Detection search parameters                                  #
    # ###################################################################### #


    def lemmatize(self, inputtier):
        """
        Lemmatize a tier.

        @param inputtier (Tier)

        """
        if self._use_lemmatize is False:
            return

        lemmatier = inputtier.Copy()

        for i in range(lemmatier.GetSize()):
            lem = self.lemmatizer.get_lem( lemmatier[i].GetLabel().GetValue() )
            lemmatier[i].GetLabel().SetValue( lem )

        return lemmatier

    # ------------------------------------------------------------------


    def relevancy(self, inputtier):
        """
        Add the most frequent tokens to a copy of the stop-words list and
        return it as a WordsList instance.

        The relevance of each token is estimated from its number of
        occurrences in the input; if its relative frequency exceeds a
        threshold, the token is added to the stop-words list.

        @param inputtier (Tier)

        """
        stoplist = self.stopwords.copy()

        # Create the Unigram and put data
        u = Unigram()
        for a in inputtier:
            if a.GetLabel().IsSpeech() is True:
                u.add( a.GetLabel().GetValue() )

        # Estimate if a token is relevant, put in the stoplist
        for token in u.get_tokens():
            freq  = u.get_value(token)
            proba = float(freq) / float(u.get_sum())
            relevant = 1.0 / (float(u.get_size())*float(self._alpha))
            if proba > relevant:
                stoplist.add( token )
                if self.logfile is not None:
                    self.logfile.print_message('Add in the stoplist: '+token, indent=3)
                elif DEBUG is True:
                    print(' ... ... ... Add in the stoplist: '+token.encode('utf8'))

        return stoplist

    # End relevancy
    # ------------------------------------------------------------------


    def find_next_break(self, inputtier, start, empan):
        """
        Return the index of the next interval representing a break.
        This depends on the 'empan' value.

        @param start is the position of the token where the search will start
        @param empan is the number of breaks (silences) to pass over

        """
        nbbreaks = 0
        for i in range (start, inputtier.GetSize()):
            if inputtier[i].GetLabel().IsSilence():
                nbbreaks = nbbreaks + 1
                if nbbreaks == empan:
                    return i
        return inputtier.GetSize() - 1

    # End find_next_break
    # ------------------------------------------------------------------


    # ###################################################################### #
    # Automatic Detection search                                             #
    # ###################################################################### #


    def _addrepetition(self, repeatobj, nbrepeats, inputtier1, inputtier2, tokstartsrc, tokstartrep, srctier, reptier):
        """
        Add sources and repetitions
        from repeatobj
        to the tiers.
        """

        n = 0
        for i in range(repeatobj.get_repeats_size()):

            # Source
            s,e = repeatobj.get_repeat_source(i)
            srcbegin = inputtier1[tokstartsrc+s].GetLocation().GetBegin()
            srcend   = inputtier1[tokstartsrc+e].GetLocation().GetEnd()
            time = TimeInterval(srcbegin.Copy(), srcend.Copy())
            srcann = Annotation(time, Label("S"+str(nbrepeats+n)))
            try:
                srctier.Add(srcann)
                if DEBUG:
                    print "src annotation added: ",srcann
            except Exception:
                continue

            # Repetition
            rep = repeatobj.get_repeat_repetition(i)
            for r in rep:
                (s,e) = r
                repbegin = inputtier2[tokstartrep+s].GetLocation().GetBegin()
                repend   = inputtier2[tokstartrep+e].GetLocation().GetEnd()

                # all other cases (no repetition, overlap)
                time = TimeInterval( repbegin.Copy(), repend.Copy() )
                repann = Annotation(time, Label("R"+str(nbrepeats+n)))
                reptier.Add(repann)
                if DEBUG:
                    print "rep annotation added: ",repann

            n = n + 1
        # end for

        return n



    def selfdetection(self, inputtier1):
        """
        Self-Repetition detection.

        @param inputtier1 (Tier)

        """
        # Verifications: is there any data?
        if inputtier1.IsEmpty() is True:
            raise Exception("Repetition. Empty input tokens tier.\n")

        # Update the stoplist
        if self._use_stopwords is True:
            stpw = self.relevancy( inputtier1 )
        else:
            stpw = self.stopwords

        # Create repeat objects
        repeatobj = Repetitions( )

        # Create output data
        srctier = Tier("Sources")
        reptier = Tier("Repetitions")
        nbrepeats = 1

        # Initialization of tokstart and tokend
        tokstart = 0
        if inputtier1[0].GetLabel().IsDummy():
            tokstart = 1
        toksearch = self.find_next_break( inputtier1, tokstart+1 , empan=1)
        tokend    = self.find_next_break( inputtier1, tokstart+1 , empan=self._empan)

        # Detection is here:
        while tokstart < tokend:

            # Build an array with the tokens
            tokens1 = list()
            for i in range(tokstart, tokend+1):
                tokens1.append( inputtier1[i].GetLabel().GetValue() )
            speaker1 = DataSpeaker( tokens1, stpw )

            # Detect repeats in these data
            repeatobj.detect( speaker1, toksearch-tokstart, None )

            # Save repeats
            if repeatobj.get_repeats_size()>0:
                n = self._addrepetition(repeatobj, nbrepeats, inputtier1, inputtier1, tokstart, tokstart, srctier, reptier)
                nbrepeats = nbrepeats + n

            # Prepare next search
            tokstart  = toksearch
            toksearch = self.find_next_break( inputtier1 , tokstart+1 , empan=1 )
            tokend    = self.find_next_break( inputtier1 , tokstart+1 , empan=self._empan )

        return (srctier,reptier)

    # End selfdetection
    # ------------------------------------------------------------------------


    def otherdetection(self, inputtier1, inputtier2):
        """
        Other-Repetition detection.

        @param inputtier1 (Tier) time-aligned tokens of the source speaker
        @param inputtier2 (Tier) time-aligned tokens of the echoing speaker

        """
        # Verifications: is there any data?
        if inputtier1.IsEmpty() is True:
            raise Exception("Repetition. Empty input tokens tier.\n")

        # Update the stoplist
        if self._use_stopwords is True:
            # other-repetition: relevance of the echoing-speaker
            stpw = self.relevancy( inputtier2 )
        else:
            stpw = self.stopwords

        # Create repeat objects
        repeatobj = Repetitions( )

        # Create output data
        srctier = Tier("OR-Source")
        reptier = Tier("OR-Repetition")

        nbrepeats = 1

        # Initialization of tokstart, and tokend
        tokstartsrc = 0
        if inputtier1[0].GetLabel().IsDummy():
            tokstartsrc = 1
        tokendsrc = min(20, inputtier1.GetSize()-1)

        # Detection is here:
        # detect() is applied word by word, from tokstartsrc to tokendsrc
        while tokstartsrc < tokendsrc:

            # Build an array with the tokens
            tokens1 = list()
            for i in range(tokstartsrc, tokendsrc):
                tokens1.append( inputtier1[i].GetLabel().GetValue() )
            speaker1 = DataSpeaker( tokens1, stpw )

            # Create speaker2
            tokens2 = list()
            nbbreaks = 0
            tokstartrep = -1
            a = inputtier1[tokstartsrc]

            for (r,b) in enumerate(inputtier2):
                if b.GetLocation().GetBeginMidpoint() >= a.GetLocation().GetBeginMidpoint():
                    if tokstartrep == -1:
                        tokstartrep = r
                    if b.GetLabel().IsSilence():
                        nbbreaks = nbbreaks + 1
                    if nbbreaks == self._empan:
                        break
                    tokens2.append( b.GetLabel().GetValue() )
            speaker2 = DataSpeaker( tokens2, stpw )

            if DEBUG is True:
                print "SRC : ",speaker1
                print "ECHO: ",speaker2

            # Detect repeats in these data: search if the first token of spk1
            # is the first token of a source.
            repeatobj.detect( speaker1, 1, speaker2 )

            # Save repeats
            shift = 1
            if repeatobj.get_repeats_size()>0:
                if DEBUG is True:
                    print " ----> found : "
                    repeatobj.get_repeat(0).print_echo()
                s,e = repeatobj.get_repeat_source(0)
                n = self._addrepetition(repeatobj, nbrepeats, inputtier1, inputtier2, tokstartsrc, tokstartrep, srctier, reptier)
                if n > 0:
                    nbrepeats = nbrepeats + n
                shift = e + 1


            while speaker1.is_token(speaker1.get_token(shift)) is False and shift < 20:
                shift = shift + 1

            tokstartsrc = tokstartsrc + shift
            tokstartsrc = min(tokstartsrc, inputtier1.GetSize()-1)
            tokendsrc   = min(tokstartsrc + 20, inputtier1.GetSize()-1)

        return (srctier,reptier)

    # End otherdetection
    # ------------------------------------------------------------------------


    # ###################################################################### #
    # Run
    # ###################################################################### #


    def run(self, inputfilename1, inputfilename2=None, outputfilename=None):
        """
        Run the Repetition Automatic Detection annotation.

        @param inputfilename1 is the input file of the main (source) speaker
        @param inputfilename2 is the input file of the echoing speaker (other-repetitions only)
        @param outputfilename is the resulting file, with the source and repetition tiers

        """
        tokentier1 = None  # First tier
        tokentier2 = -1    # No echoing speaker
        try:
            # Find the token tier
            trsinput1 = annotationdata.io.read( inputfilename1 )
            for i in range( trsinput1.GetSize() ):
                if "token" in trsinput1[i].GetName().lower() and "align" in trsinput1[i].GetName().lower():
                    tokentier1 = i
                    break
            if inputfilename2 is not None:
                #find the token tier
                trsinput2 = annotationdata.io.read( inputfilename2 )
                for i in range( trsinput2.GetSize() ):
                    if "token" in trsinput2[i].GetName().lower() and "align" in trsinput2[i].GetName().lower():
                        tokentier2 = i
                        break
        except Exception as e:
            raise Exception('Repetitions. '+str(e))

        if tokentier1 is None:
            raise Exception('Repetitions. No valid input tier (expected: TokensAlign).')

        # Lemmatize input?
        if self._use_lemmatize is True and self.lemmatizer:
            tier1 = self.lemmatize( trsinput1[tokentier1] )
            if tokentier2 > -1:
                tier2 = self.lemmatize( trsinput2[tokentier2] )
        else:
            tier1 = trsinput1[tokentier1]
            if tokentier2 > -1:
                tier2 = trsinput2[tokentier2]

        if self.logfile is not None:
            self.logfile.print_message("Empan = "+str(self._empan), indent=3)
            self.logfile.print_message("Alpha = "+str(self._alpha), indent=3)

        # Repetition Automatic Detection
        if tokentier2 == -1:
            (srctier,reptier) = self.selfdetection( tier1 )
        else:
            (srctier,reptier) = self.otherdetection( tier1 , tier2 )

        # Manage results:
        # An output file name is given
        if outputfilename:
            trsoutput = Transcription("Repetitions")
            if self._merge is True:
                for i in range( trsinput1.GetSize() ):
                    trsoutput.Add( trsinput1[i] )
        # the repeat tier is added to the input transcription
        else:
            outputfilename = inputfilename1
            trsoutput = annotationdata.io.read( inputfilename1 )

        # Add repeats to this trsoutput
        trsoutput.Append( srctier )
        trsoutput.Append( reptier )

        trsoutput.SetMinTime( trsinput1.GetMinTime() )
        trsoutput.SetMaxTime( trsinput1.GetMaxTime() ) # hum, in case of OR... not sure! to be verified.

        # Save
        annotationdata.io.write( outputfilename, trsoutput )
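
Minimal usage sketch (file and resource names are hypothetical, not taken from
the example above). The input files are expected to contain a time-aligned
tokens tier, i.e. a tier whose name contains both "token" and "align".

>>> p = sppasRepetition( "vocab/eng.stp" )    # stop-words resource (hypothetical path)
>>> p.set_empan( 5 )
>>> p.set_alpha( 0.5 )
>>> # Self-repetitions of a single speaker:
>>> p.run( "spk1-palign.xra", outputfilename="spk1-ralign.xra" )
>>> # Other-repetitions, with a second (echoing) speaker:
>>> p.run( "spk1-palign.xra", "spk2-palign.xra", "or-ralign.xra" )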