Example #1
# NB: this example assumes the project-internal helpers (SubtitleConverter,
# Sentence, Tokeniser, SpellChecker, detectEncoding) are imported from the
# surrounding module; only the standard-library dependency is shown.
import sys

class BilingualConverter(SubtitleConverter):
    """Special converter for handling bilingual subtitles (with the first line
    of each block in one language, and the second line in another).
    """

    def __init__(self, input, output, output2, rawOutput=None, rawOutput2=None,
                 language=None, language2=None, meta=None, encoding=None,
                 alwaysSplit=False):
        """Creates a new converter for a given input and output (as file
        objects). A second file object for the raw output can also be provided.
        
        Args:
            output2(file object): XML subtitle file for the second language
            rawOutput2(file object): XML file for the untokenised output in
                the second language
            language2(Language object): second language (required)
        
        The other arguments are the same as for SubtitleConverter.
        
        """                 
        SubtitleConverter.__init__(self, input, output, rawOutput, language, 
                                   meta, encoding, alwaysSplit)
        self.encodings += language2.encodings
        detected = detectEncoding(self.inputs[0], self.encodings)
        self.encodings = [detected] + self.encodings
            
        self.lang2 = language2
        self.output2 = output2
        self.rawOutput2 = rawOutput2
          
         
    
    def doConversion(self):   
        """Performs the conversion process, reading the full subtitle file
        and writing the converted content into the output file.
        
        """       
        self.text2 = ""            
        self.nbTokens2 = 0
        self.sid2 = 0
        self.tokeniser2 = Tokeniser(self.lang2)
        self.spellchecker2 = SpellChecker(self.lang2)
        SubtitleConverter.doConversion(self) 
        self.tokeniser2.close()        
        
        
    def _startDocument(self):
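        # Writes the XML header to the primary output(s), then switches to the
        # second-language state to write the header of the second output(s)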
        SubtitleConverter._startDocument(self)
        self._switchLanguage()
        SubtitleConverter._startDocument(self)
        self._switchLanguage()
        
    def _flushDocument(self):
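        # Flushes metadata and closes both language documents, using the same
        # switch-and-repeat pattern as _startDocument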
        SubtitleConverter._flushDocument(self)
        self._switchLanguage()
        SubtitleConverter._flushDocument(self)
        self._switchLanguage()
          
    
    def _writeBlock(self, block):
        """Processes the block content (doing sentence segmentation and 
        tokenisation) in the particular case of "bilingual" subtitles, where
        two languages are used in the subtitle (one on each line).
        
        In this setting, we always split sentences at the end of each block.
        
        """
        # Loops on each line of the subtitle block
        for linenum in range(len(block.lines)):
            self.sentence = Sentence()
            self.sentence.addStamp("T%sS"%block.id, block.start)
            self._recordLine(block, linenum)
            self.sentence.addStamp("T%sE"%block.id, block.end)
            self._flushSentence()
            # Switches to the second language after the first line
            if not linenum:
                self._switchLanguage()
        # Switches back so the primary language comes first in the next block
        if block.lines:
            self._switchLanguage()
         
    
       
    def _switchLanguage(self):
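        # Swaps all language-specific state (outputs, text buffers, token and
        # sentence counters, tokeniser, spellchecker) so that the inherited
        # SubtitleConverter methods operate on the other language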
        self.output, self.output2 = self.output2, self.output
        self.rawOutput, self.rawOutput2 = self.rawOutput2, self.rawOutput
        self.text, self.text2 = self.text2, self.text
        self.nbTokens, self.nbTokens2 = self.nbTokens2, self.nbTokens
        self.lang, self.lang2 = self.lang2, self.lang
        self.sid, self.sid2 = self.sid2, self.sid
        self.tokeniser, self.tokeniser2 = self.tokeniser2, self.tokeniser
        self.spellchecker, self.spellchecker2 = self.spellchecker2, self.spellchecker
         

    def closeOutputs(self):           
        SubtitleConverter.closeOutputs(self)           
        if self.output2 != sys.stdout.buffer:
            self.output2.close()
        if self.rawOutput2:
            self.rawOutput2.close()
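
A minimal usage sketch (the file names and the english/french Language objects
are hypothetical; the real Language class lives elsewhere in the project):

    with open("movie.srt", "rb") as srt, \
         open("movie.en.xml", "wb") as out, \
         open("movie.fr.xml", "wb") as out2:
        converter = BilingualConverter(srt, out, out2,
                                       language=english, language2=french)
        converter.doConversion()
        converter.closeOutputs()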
Example #2
# NB: this example assumes the project-internal helpers (SubtitleBlock, Sentence,
# Tokeniser, SpellChecker, detectEncoding, tosecs, addsecs, utils, numberRegex,
# timingRegex, the stop-punctuation lists and threshold constants) are defined
# in the surrounding module; only standard-library imports are shown.
import os
import sys
import xml.etree.ElementTree as et

class SubtitleConverter:

    def __init__(self, input, output, rawOutput=None, language=None,
                 meta=None, encoding=None, alwaysSplit=False):
        """Creates a new converter for a given input and output (as file
        objects). A second file object for the raw output can also be provided.
        
        Args:
            input(file object or list of file objects): raw subtitle files
            output(file object): XML subtitle file for the tokenised output
            rawOutput(file object): XML subtitle file for the untokenised output
            language(Language object): language for the subtitle
            meta(dict): meta-data to append to the end of the XML file(s)
            encoding(str): file encoding to use to read the raw subtitle files
            alwaysSplit(bool): whether to always split subtitle blocks as new 
                sentences (default is false).
        """
                    
        self.lang = language
        self.alwaysSplit = alwaysSplit
        if self.lang and self.lang.alwaysSplit:
            self.alwaysSplit = True
            
        self.inputs = input if isinstance(input,list) else [input]
        
        self.encodings = [encoding] if encoding else []
        self.encodings += (self.lang.encodings if self.lang else [])
        if not self.lang or self.lang.codes[0] in difficult_langs:
            detected = detectEncoding(self.inputs[0], self.encodings)
            self.encodings = [detected] + self.encodings
                        
        self.output = output
        self.rawOutput = rawOutput   
        self.meta = meta    
        
    
    def doConversion(self):   
        """Performs the conversion process, reading the full subtitle file
        and writing the converted content into the output file.
        """  
             
        self.curLine = None                     # Current line in the raw file
        self.curBlock = None                    # Current block
        self.curLineIndex = 0                   # Current line index in the raw file
        self.timeOffset = 0                     # Time offset (for multi-CD subtitles)

        self.sid = 0                            # Current sentence identifier
        self.nbTokens = 0                       # Total number of words
        self.nbIgnoredBlocks = 0                # Number of ignored subtitle blocks
        self.sentence = Sentence()     # Tokens in the current sentence
        self.text = ""                          # Collection of all subtitle lines
        
        # Starting the tokeniser and spellchecker
        self.tokeniser = Tokeniser(self.lang)
        self.spellchecker = SpellChecker(self.lang)
        
        self._startDocument()       
    
        # Looping on the subtitle blocks
        block = self._readBlock()
        while block:              
            # Ignoring spurious subtitle blocks    
            if block.isSpurious():
                self.nbIgnoredBlocks += 1
                block = self._readBlock()
                continue
            
            self._writeBlock(block)           
            block = self._readBlock()
            
        self._flushDocument()
        self.tokeniser.close()
        
       
    def _startDocument(self):
        """Writes the header of the XML subtitle file. 
        
        """
        id = self.meta["id"] if self.meta and "id" in self.meta else ""
        if not id and self.inputs and hasattr(self.inputs[0], "name"):
            id = os.path.basename(self.inputs[0].name).split(".")[0]
        id = id.encode("utf-8")
            
        self.output.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        self.output.write(b'<document id="' + id + b'">\n')
        if self.rawOutput:
            self.rawOutput.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            self.rawOutput.write(b'<document id="' + id + b'">\n') 
              
   
    def _readBlock(self, recursive=0):
        """Reads one subtitle block and returns it.
          
        """
        block = SubtitleBlock()
        block.previous = self.curBlock
        block.offset = self.timeOffset
        
        # Reads the very first line; guards against infinite recursion when
        # blocks repeatedly fail to parse
        if not self.curLine:
            self._readline()
        elif recursive > 20:
            raise RuntimeError("Wrong encoding format for subtitle")
            
        # Continues until a non-empty line is found
        while self.curLine and not self.curLine.strip():
            self._readline()

        # If we arrive at the end of the file object, checks whether any
        # other file should be read (in case of multi-CD subtitles). If yes,
        # opens the new file and continues. Else, returns None.
        if not self.curLine:
            self.inputs.pop(0)
            self.curLineIndex = 0
            
            if self.inputs:
                nextBlock = self._readBlock()
                lasttime = tosecs(block.previous.end) if block.previous else 0
                # shifting the start and end times after the first CD
                if nextBlock and nextBlock.start and lasttime > tosecs(nextBlock.start):
                    nextBlock.start = addsecs(nextBlock.start, lasttime-self.timeOffset)
                    nextBlock.end = addsecs(nextBlock.end, lasttime-self.timeOffset)
                    self.timeOffset = lasttime
                return nextBlock
            else:
                return None
                  
        # Detects the subtitle identifier
        numberMatch = numberRegex.match(self.curLine)
        if numberMatch:
            block.setId(int(numberMatch.group(1)))
            self._readline()
        else:        
            block.setId((self.curBlock.id+1) if self.curBlock else 1)
         
        # Ignores empty lines
        while self.curLine and not self.curLine.strip():
            self._readline() 
            
        # Detects the start and end time           
        timingMatch = timingRegex.match(self.curLine)
        if not timingMatch:
            sys.stderr.write("Cannot parse timing (line number: %i): %s"
                             %(self.curLineIndex, self.curLine))
            self._readline()
            self.nbIgnoredBlocks += 1
            return self._readBlock(recursive+1)
        block.setTiming(timingMatch.group(1), timingMatch.group(2)) 
   
        # Reads the subtitle content until we arrive at the next subtitle ID
        # or the end of the file (NB: simply stopping at an empty line does
        # not always work, since some files strangely contain empty lines 
        # within subtitle blocks).
        self._readline()
        while self.curLine.strip():
            block.addLine(self.curLine)
            self._readline()          
        while self.curLine and not numberRegex.match(self.curLine):
            block.addLine(self.curLine)
            self._readline()

        self.curBlock = block
        return block
    
    
    def _readline(self):
        """ Reads the next line in the file, decodes it according to the
        current encoding, and returns it. If a decoding error is detected,
        tries to change the encoding if an alternative is possible.
        
        """
        binaryLine = self.inputs[0].readline() if self.inputs else b""
        self.curLine = None
        while self.curLine is None and self.encodings:
            encoding = self.encodings[0]  
            try:
                self.curLine = binaryLine.decode(encoding)
            except UnicodeDecodeError:
                # If we get a decoding error, removes the encoding from
                # the list of possible encodings, and retry.
                self.encodings.remove(encoding)

        if self.curLine is None:
            raise RuntimeError("Decoding error (encoding: %s, line: %i)"
                             %(encoding, self.curLineIndex))
        elif self.curLineIndex==0:
            self.curLine = self.curLine.lstrip("\ufeff")

        self.curLineIndex += 1
  
                      
    def _writeBlock(self, block):
        """ Processes the block content by doing sentence segmentation, 
        tokenisation, and writes the results into the XML file.
        
        """
        # First check whether the block is a continuation of the previous
        # sentence. If not, "flush" the current sentence to start a new one.
        if not self._isContinuation(block):    
            self._flushSentence()
         
        self.sentence.addStamp("T%sS"%block.id, block.start)
        
        # Loops on each line of the subtitle block
        for linenum in range(len(block.lines)):
            self.sentence.addRawChar(' ' if self.sentence.raw else '')
            self._recordLine(block, linenum)       
            
        self.sentence.addStamp("T%sE"%block.id, block.end)
    
        
  
    def _recordLine(self, block, linenum):
        """ Records the subtitle line, checking for the occurrence of 
        end-of-sentence markers along the way, and flushing the current 
        sentence in that case.
        
        """
        # Doing the actual tokenisation
        line = block.lines[linenum]
        tokens = self.tokeniser.tokenise(line)   
        curPos = 0       # Current character position in the line

        # Flags lines mostly written in upper case (more than 2/3 of the
        # characters upper-case or non-alphabetic)
        upperline = len([c for c in line if c.isupper() or not c.isalpha()]) > 2*len(line)/3
        for i, token in enumerate(tokens):  
            
            curPos += len(token)

            # Assume a new sentence if an utterance starting with "-" is found
            if (token=="-" and i < len(tokens)-1 and 
                (tokens[i+1][0].isupper() or (self.lang and self.lang.unicase))):
                self._flushSentence()

            # Handle all-uppercase tokens
            emphasised = block.isEmphasised(linenum, curPos)
            prev = self.sentence.lastToken
            if token.isupper() and ((not token.istitle() and self.spellchecker.lm) or upperline):
                corrected = self.spellchecker.recapitalise(token, prev, upperline)
                if corrected != token:
                    self.sentence.addToken(token, emphasised|(not upperline), alternative=corrected)
                else:
                    self.sentence.addToken(token, emphasised)
           
            # Usual case
            else:
                corrected, prob = self.spellchecker.spellcheck(token, prev)
                if prev in stopPunctuations2 and corrected.istitle():
                    self._flushSentence()
                emphasised = block.isEmphasised(linenum, curPos)
                if corrected == token:
                    self.sentence.addToken(token, emphasised)
                elif prob > 0.8:
                    self.sentence.addToken(corrected, emphasised, initial=token)
                else:
                    self.sentence.addToken(token, emphasised, alternative=corrected)
                
            while curPos < len(line) and line[curPos].isspace():
                self.sentence.addRawChar(line[curPos])
                curPos += 1
                
            # Do not flush the sentence for the last token of the block, or
            # when the next token is a quotation mark
            if ((linenum==len(block.lines)-1 and i==len(tokens)-1)
                or (i < len(tokens)-1 and tokens[i+1]=="\"")):
                continue
            
            if token[0] in stopPunctuations1:
                self._flushSentence()
            elif (token[0] in stopPunctuations2 and i > 0 and 
                (i==len(tokens)-1 or tokens[i+1][0].isupper() or tokens[i+1][0]=="l"
                 or (self.lang and self.lang.unicase))):
                self._flushSentence()
                    
 
    
    def _isContinuation(self, block):
        """Returns true if the block is likely to be a continuation of the current
        sentence
        
        """
        if (not self.sentence or not block.lines 
            or not block.previous or not block.previous.lines):
            return True
        elif self.alwaysSplit:
            return False
         
        score = 0     # Initial continuation score
        
        # Scoring based on the end of the previous block
        lastline = block.previous.lines[-1].rstrip(")]} ")
        stopEndings = stopPunctuations1 + stopPunctuations2 + ["\""]
        if lastline.endswith("..."):
            score += 2
        elif lastline and lastline[-1] in stopEndings:
            score += -3
            
        # Scoring based on the beginning of the current block
        newline = block.lines[0].lstrip("'[*# ")
        if not newline:
            score += -2        
        elif lastline.endswith("-") and newline.startswith("-"):
            score += 2
        elif newline[0] in ["-","\"", "¿", "¡", "'"]:
            score += -2
        elif newline.startswith("..."):
            score += 2
        elif newline[0].isupper():
            score += -3
        elif newline[0].islower():     
            score += 2        
        elif newline[0].isnumeric() or (self.lang and self.lang.unicase):
            score += 1

        # Scoring based on time gaps
        if block.start and block.previous.end:
            pause = tosecs(block.start) - tosecs(block.previous.end)
            score += (-1 if pause > PAUSE_THR1 else 0)
            score += (-1 if pause > PAUSE_THR2 else 0)
            
        # Scoring based on sentence lengths
        score += (-1 if self.sentence.getNbStamps() > 3 else 0)
        score += (-1 if self.sentence.getNbTokens() > WORDS_THR else 0)
        return score > 0
        


    def _flushSentence(self):
        """ Writes the tokens to the XML file (and the untokenised output if
        that option is activated) and clears the current sentence.
        
        """ 
        nbTokens = self.sentence.getNbTokens()
        if not nbTokens:
            return
        self.nbTokens += nbTokens 
        self.sid += 1
        self._pruneTokens()           
        self._writeTokens()
        if self.rawOutput:
            self._writeRaw()
        
        # We record the text content for language identification purposes
        self.text += self.sentence.rawCorrected + "\n"
        
        self.sentence = Sentence() 
        
  
  
    def _pruneTokens(self):
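        """ Removes a duplicated continuation marker ("..." or "-") emitted on
        both sides of a block boundary, keeping the intervening time stamps
        and cleaning the raw text accordingly.
        
        """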
        entities = self.sentence.entities
        for i in range(1, len(entities)-4):
            if (entities[i][0]=="w" and (entities[i][1]=="..." or entities[i][1]=="-") 
                and entities[i+1][0]=="time" and entities[i+2][0]=="time" 
                and entities[i+3][0]=="w" and entities[i+3][1]==entities[i][1]):
                self.sentence.entities = entities[0:i] + entities[i+1:i+3] + entities[i+4:]
                self.sentence.raw = self.sentence.raw.replace("... ...", " ")
                self.sentence.raw = self.sentence.raw.replace("- -", " ")
                break
        
          
    def _writeTokens(self):
        """ Writes the tokens in self.sentence to the XML file. 
                
        """
        builder = et.TreeBuilder()  
        sattrs = {"id":str(self.sid)}
        if self.sentence.isEmphasised():
            sattrs.update({"emphasis":"true"})
            for w in self.sentence.getTokens():
                del w[2]["emphasis"]
        builder.start("s",sattrs)
        tokid = 0
        entities = self.sentence.getEntities()
        for i, entity in enumerate(entities):
            
            # Write a <w> entity for the token
            if entity[0]=="w":
                token = entity[1]
                tokid += 1
              
                builder.data("\n    ")
                wattrs = {"id":"%i.%i"%(self.sid,tokid)}
                wattrs.update(entity[2])
                builder.start("w",wattrs)
                builder.data(token)
                builder.end("w") 
                     
            # Write a <time> entity
            elif entity[0]=="time":
                builder.data("\n    ")
                builder.start("time",entity[1])
                builder.end("time")    
          
        builder.data("\n  ")
        builder.end("s")
        tree = et.ElementTree(builder.close())
        self.output.write(b"  ")
        tree.write(self.output, encoding='utf-8')
        self.output.write(b"\n") 
        
        
               
    def _writeRaw(self):
        """ Writes the raw sentence to the XML file. 
                
        """
        builder = et.TreeBuilder()  
        attrs =  {"id":str(self.sid)}
        builder.start("s",attrs)

        # Add timing info at the beginning of the sentence
        entities = self.sentence.getEntities()
        if entities and entities[0][0] == "time":
            builder.data("\n    ")
            builder.start("time",entities[0][1])
            builder.end("time")  
            
        builder.data("\n")
        builder.data(self.sentence.raw)
          
        # Add timing info at the end of the sentence
        if entities and entities[-1][0] == "time":
            builder.data("\n    ")
            builder.start("time",entities[-1][1])
            builder.end("time")   
          
        builder.data("\n  ")
        builder.end("s")
        tree = et.ElementTree(builder.close())
        self.rawOutput.write(b"  ")
        tree.write(self.rawOutput, encoding='utf-8')
        self.rawOutput.write(b"\n") 
    
    
    
    def _extractMetadata(self):
        """ Extracts meta-data on the subtitle and the conversion process,
        in order to append it to the end of the XML file.
        
        """
        meta = self.meta if self.meta else {}
        if "id" in meta:
            del meta["id"]
        meta["subtitle"] = meta["subtitle"] if "subtitle" in meta else {}
        meta["conversion"] = {}
        if self.lang:
            meta["subtitle"]["language"] = self.lang.name
            # Performs language identification on the collected text
            langProb = self.lang.getProb(self.text)
            if langProb < 0.1 and not isinstance(self, BilingualConverter):
                msg = "Subtitle is not encoded in " + self.lang.name
                msg += " (distrib: " + str(utils.getProbDist(self.text)) + ")"
                raise RuntimeError(msg)
            meta["subtitle"]["confidence"] = str(langProb)
        
        if self.curBlock:
            meta["subtitle"]["blocks"] = str(self.curBlock.id)
            meta["subtitle"]["duration"] = self.curBlock.end
   
        meta["conversion"]["sentences"] = str(self.sid)
        meta["conversion"]["tokens"] = str(self.nbTokens)
        meta["conversion"]["encoding"] = self.encodings[0]
        meta["conversion"]["ignored_blocks"] = str(self.nbIgnoredBlocks)
        sc = self.spellchecker
        meta["conversion"]["unknown_words"] = str(sc.nbUnknowns)
        meta["conversion"]["corrected_words"] = str(sc.nbCorrections)
        meta["conversion"]["truecased_words"] = str(sc.nbTruecased)
        return meta
    
    
    def _flushDocument(self):
        """ Adds the final meta-data to the XML file, and closes the XML document.
        
        """
        self._flushSentence()
        meta = self._extractMetadata()
        metaBuilder = et.TreeBuilder()
        metaBuilder.start("meta")
        
        for part in meta:
            metaBuilder.data("\n    ")
            metaBuilder.start(part)
            if isinstance(meta[part],dict):
                for key in meta[part]:
                    metaBuilder.data("\n      ")
                    metaBuilder.start(key)
                    metaBuilder.data(meta[part][key])
                    metaBuilder.end(key)
            metaBuilder.data("\n    ")
            metaBuilder.end(part)
        
        metaBuilder.data("\n  ")   
        metaBuilder.end("meta")
        tree = et.ElementTree(metaBuilder.close())
        for fd in [self.output,self.rawOutput]:
            if fd:
                fd.write(b"  ")
                tree.write(fd, encoding='utf-8')
                fd.write(b"\n</document>\n") 
                
    
    def closeOutputs(self):           
        if self.output != sys.stdout.buffer:
            self.output.close()
        if self.rawOutput:
            self.rawOutput.close()
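
A minimal usage sketch for the base converter (the file names, the english
Language object, and the metadata dict are hypothetical):

    with open("movie.srt", "rb") as srt, open("movie.xml", "wb") as out:
        converter = SubtitleConverter(srt, out, language=english,
                                      meta={"id": "movie"})
        converter.doConversion()
        converter.closeOutputs()

For multi-CD subtitles, a list of file objects can be passed as input; the
timing of each additional CD is shifted so the timestamps stay monotonic.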