Ejemplo n.º 1
0
    def doConversion(self):
        """Run the full conversion: read every subtitle block from the raw
        input and write the converted content to the output file(s).
        """
        # Reader state
        self.curLine = None          # line currently being processed
        self.curBlock = None         # block currently being processed
        self.curLineIndex = 0        # index of the current line in the raw file
        self.timeOffset = 0          # cumulative offset for multi-CD subtitles

        # Conversion counters and buffers
        self.sid = 0                 # sentence identifier counter
        self.nbTokens = 0            # running total of words
        self.nbIgnoredBlocks = 0     # count of discarded subtitle blocks
        self.sentence = Sentence()   # sentence currently under construction
        self.text = ""               # accumulated text of all subtitle lines

        # Set up tokenisation and spellchecking for the target language
        self.tokeniser = Tokeniser(self.lang)
        self.spellchecker = SpellChecker(self.lang)

        self._startDocument()

        # Process the subtitle blocks one at a time until input is exhausted
        while True:
            nextBlock = self._readBlock()
            if not nextBlock:
                break
            if nextBlock.isSpurious():
                # Skip blocks that carry no usable subtitle content
                self.nbIgnoredBlocks += 1
                continue
            self._writeBlock(nextBlock)

        self._flushDocument()
        self.tokeniser.close()
Ejemplo n.º 2
0
class SubtitleConverter:
    """Converts raw subtitle files (SRT-style blocks with an id, a timing
    line and content lines) into a tokenised XML subtitle document, and
    optionally an untokenised ("raw") XML document, performing sentence
    segmentation, tokenisation and spellchecking along the way.
    """

    def __init__(self, input, output, rawOutput=None, language=None, 
                meta=None, encoding=None, alwaysSplit=False):
        """Creates a new converter for a given input and output (as file
        objects). A second file object for the raw output can also be provided.
        
        Args:
            input(file object or list of file objects): raw subtitle files
            output(file object): XML subtitle file for the tokenised output
            rawOutput(file object): XML subtitle file for the untokenised output
            language(Language object): language for the subtitle
            meta(dict): meta-data to append to the end of the XML file(s)
            encoding(str): file encoding to use to read the raw subtitle files
            alwaysSplit(bool): whether to always split subtitle blocks as new 
                sentences (default is false).
        """
        self.lang = language
        self.alwaysSplit = alwaysSplit
        # Some languages always require block-level sentence splitting
        if self.lang and self.lang.alwaysSplit:
            self.alwaysSplit = True

        # Normalise the input to a list (multi-CD subtitles span several files)
        self.inputs = input if isinstance(input, list) else [input]

        # Candidate encodings: the explicit one first, then the language
        # defaults; for languages whose defaults are unreliable, prepend an
        # encoding detected from the file content itself.
        self.encodings = [encoding] if encoding else []
        self.encodings += (self.lang.encodings if self.lang else [])
        if not self.lang or self.lang.codes[0] in difficult_langs:
            detected = detectEncoding(self.inputs[0], self.encodings)
            self.encodings = [detected] + self.encodings

        self.output = output
        self.rawOutput = rawOutput
        self.meta = meta


    def doConversion(self):
        """Performs the conversion process, reading the full subtitle file
        and writing the converted content into the output file.
        """
        self.curLine = None                     # Current line in the raw file
        self.curBlock = None                    # Current block
        self.curLineIndex = 0                   # Current line index in the raw file
        self.timeOffset = 0                     # Time offset (for multi-CD subtitles)

        self.sid = 0                            # Current sentence identifier
        self.nbTokens = 0                       # Total number of words
        self.nbIgnoredBlocks = 0                # Number of ignored subtitle blocks
        self.sentence = Sentence()              # Tokens in the current sentence
        self.text = ""                          # Collection of all subtitle lines

        # Starting the tokeniser and spellchecker
        self.tokeniser = Tokeniser(self.lang)
        self.spellchecker = SpellChecker(self.lang)

        self._startDocument()

        # Looping on the subtitle blocks
        block = self._readBlock()
        while block:
            # Ignoring spurious subtitle blocks
            if block.isSpurious():
                self.nbIgnoredBlocks += 1
                block = self._readBlock()
                continue

            self._writeBlock(block)
            block = self._readBlock()

        self._flushDocument()
        self.tokeniser.close()


    def _startDocument(self):
        """Writes the XML declaration and the opening <document> tag to the
        output file(s). The document id comes from the meta-data when
        available, otherwise from the first input file name.
        """
        docId = self.meta["id"] if self.meta and "id" in self.meta else ""
        if not docId and self.inputs and hasattr(self.inputs[0], "name"):
            docId = os.path.basename(self.inputs[0].name).split(".")[0]
        docId = docId.encode("utf-8")

        self.output.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        self.output.write(b'<document id="' + docId + b'">\n')
        if self.rawOutput:
            self.rawOutput.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            self.rawOutput.write(b'<document id="' + docId + b'">\n')


    def _readBlock(self, recursive=0):
        """Reads one subtitle block and returns it, or None when the end of
        the last input file has been reached.

        Args:
            recursive(int): number of consecutive unparsable blocks; used to
                detect files whose encoding is wrong altogether.

        Raises:
            RuntimeError: if too many blocks in a row cannot be parsed.
        """
        block = SubtitleBlock()
        block.previous = self.curBlock
        block.offset = self.timeOffset

        # Reads the very first line
        if not self.curLine:
            self._readline()
        elif recursive > 20:
            raise RuntimeError("Wrong encoding format for subtitle")

        # Continues until a non-empty line is found
        while self.curLine and not self.curLine.strip():
            self._readline()

        # If we arrive at the end of the file object, checks whether any
        # other file should be read (in case of multi-CD subtitles). If yes,
        # opens the new file and continue.  Else, returns None.
        if not self.curLine:
            self.inputs.pop(0)
            self.curLineIndex = 0

            if self.inputs:
                nextBlock = self._readBlock()
                lasttime = tosecs(block.previous.end) if block.previous else 0
                # shifting the start and end times after the first CD
                if nextBlock and nextBlock.start and lasttime > tosecs(nextBlock.start):
                    nextBlock.start = addsecs(nextBlock.start, lasttime-self.timeOffset)
                    nextBlock.end = addsecs(nextBlock.end, lasttime-self.timeOffset)
                    self.timeOffset = lasttime
                return nextBlock
            else:
                return None

        # Detects the subtitle identifier
        numberMatch = numberRegex.match(self.curLine)
        if numberMatch:
            block.setId(int(numberMatch.group(1)))
            self._readline()
        else:
            # No explicit id: continue the numbering from the previous block
            block.setId((self.curBlock.id+1) if self.curBlock else 1)

        # Ignores empty lines
        while self.curLine and not self.curLine.strip():
            self._readline()

        # Detects the start and end time
        timingMatch = timingRegex.match(self.curLine)
        if not timingMatch:
            sys.stderr.write("Cannot parse timing (line number: %i): %s"
                             %(self.curLineIndex, self.curLine))
            self._readline()
            self.nbIgnoredBlocks += 1
            return self._readBlock(recursive+1)
        block.setTiming(timingMatch.group(1), timingMatch.group(2))

        # Reads the subtitle content until we arrive at the next subtitle ID
        # or the end of the file (NB: simply stopping at an empty line does
        # not always work, since some files strangely contain empty lines
        # within subtitle blocks).
        self._readline()
        while self.curLine.strip():
            block.addLine(self.curLine)
            self._readline()
        while self.curLine and not numberRegex.match(self.curLine):
            block.addLine(self.curLine)
            self._readline()

        self.curBlock = block
        return block


    def _readline(self):
        """Reads the next line in the file, decodes it according to the
        current encoding, and stores it in self.curLine. If a decoding error
        is detected, tries to change the encoding if an alternative is
        possible.

        Raises:
            RuntimeError: if none of the candidate encodings can decode
                the line.
        """
        # Default to an empty line when every input has been exhausted
        # (previously binaryLine could be left unbound, causing a NameError)
        binaryLine = b""
        if self.inputs:
            binaryLine = self.inputs[0].readline()

        self.curLine = None
        encoding = None
        while self.curLine is None and self.encodings:
            encoding = self.encodings[0]
            try:
                self.curLine = binaryLine.decode(encoding)
            except UnicodeDecodeError:
                # If we get a decoding error, removes the encoding from
                # the list of possible encodings, and retry.
                self.encodings.remove(encoding)

        if self.curLine is None:
            raise RuntimeError("Decoding error (encoding: %s, line: %i)"
                             %(encoding, self.curLineIndex))
        elif self.curLineIndex == 0:
            # Strip the byte-order mark from the very first line
            self.curLine = self.curLine.lstrip("\ufeff")

        self.curLineIndex += 1


    def _writeBlock(self, block):
        """ Processes the block content by doing sentence segmentation,
        tokenisation, and writes the results into the XML file.
        """
        # First check whether the block is a continuation of the previous
        # sentence. If not, "flush" the current sentence to start a new one.
        if not self._isContinuation(block):
            self._flushSentence()

        # Time stamp marking the start of the block
        self.sentence.addStamp("T%sS"%block.id, block.start)

        # Loops on each line of the subtitle block
        for linenum in range(len(block.lines)):
            # Separate consecutive lines by a space in the raw text
            self.sentence.addRawChar(' ' if self.sentence.raw else '')
            self._recordLine(block, linenum)

        # Time stamp marking the end of the block
        self.sentence.addStamp("T%sE"%block.id, block.end)


    def _recordLine(self, block, linenum):
        """ Records the subtitle line, checking for the occurrence of
        end-of-sentence markers along the way, and flushing the current
        sentence in that case.
        """
        # Doing the actual tokenisation
        line = block.lines[linenum]
        tokens = self.tokeniser.tokenise(line)
        curPos = 0       # Current character position in the line

        # A line counts as "upper" when more than 2/3 of its characters are
        # upper-case or non-alphabetic (typical of hearing-impaired captions)
        upperline = sum(1 for c in line if c.isupper() or not c.isalpha()) > 2*len(line)/3
        for i, token in enumerate(tokens):

            curPos += len(token)

            # Assume a new sentence if an utterance started with "-" is found
            if (token=="-" and i < len(tokens)-1 and
                (tokens[i+1][0].isupper() or (self.lang and self.lang.unicase))):
                self._flushSentence()

            # Handle all-uppercase tokens
            emphasised = block.isEmphasised(linenum, curPos)
            prev = self.sentence.lastToken
            if token.isupper() and ((not token.istitle() and self.spellchecker.lm) or upperline):
                corrected = self.spellchecker.recapitalise(token, prev, upperline)
                if corrected != token:
                    self.sentence.addToken(token, emphasised|(not upperline), alternative=corrected)
                else:
                    self.sentence.addToken(token, emphasised)

            # Usual case
            else:
                 corrected, prob = self.spellchecker.spellcheck(token, prev)
                 # A capitalised word after a sentence-final punctuation mark
                 # signals a sentence boundary
                 if prev in stopPunctuations2 and corrected.istitle():
                     self._flushSentence()
                 emphasised = block.isEmphasised(linenum, curPos)
                 if corrected == token:
                     self.sentence.addToken(token, emphasised)
                 elif prob > 0.8:
                     # Confident correction: keep it, record the original
                     self.sentence.addToken(corrected, emphasised, initial=token)
                 else:
                     # Unsure correction: keep the original, record the guess
                     self.sentence.addToken(token, emphasised, alternative=corrected)

            # Copy the whitespace following the token into the raw text
            while curPos < len(line) and line[curPos].isspace():
                self.sentence.addRawChar(line[curPos])
                curPos += 1

            # Do not flush the sentence for the last token in the last line
            if ((linenum==len(block.lines)-1 and i==len(tokens)-1)
                or (i < len(tokens)-1 and tokens[i+1]=="\"")):
                continue

            if token[0] in stopPunctuations1:
                self._flushSentence()
            elif (token[0] in stopPunctuations2 and i > 0 and
                (i==len(tokens)-1 or tokens[i+1][0].isupper() or tokens[i+1][0]=="l"
                 or (self.lang and self.lang.unicase))):
                self._flushSentence()


    def _isContinuation(self, block):
        """Returns true if the block is likely to be a continuation of the
        current sentence, based on a heuristic continuation score.
        """
        if (not self.sentence or not block.lines
            or not block.previous or not block.previous.lines):
            return True
        elif self.alwaysSplit:
            return False

        score = 0     # Initial continuation score

        # Scoring based on the end of the previous block
        lastline = block.previous.lines[-1].rstrip(")]} ")
        stopEndings = stopPunctuations1 + stopPunctuations2 + ["\""]
        if lastline.endswith("..."):
            score += 2
        elif lastline and lastline[-1] in stopEndings:
            score += -3

        # Scoring based on the beginning of the current block
        newline = block.lines[0].lstrip("'[*# ")
        if not newline:
            score += -2
        elif lastline.endswith("-") and newline.startswith("-"):
            score += 2
        elif newline[0] in ["-","\"", "¿", "¡", "'"]:
            score += -2
        elif newline.startswith("..."):
            score += 2
        elif newline[0].isupper():
            score += -3
        elif newline[0].islower():
            score += 2
        elif newline[0].isnumeric() or (self.lang and self.lang.unicase):
            score += 1

        # Scoring based on time gaps: long pauses suggest a sentence break
        if block.start and block.previous.end:
            pause = tosecs(block.start) - tosecs(block.previous.end)
            score += (-1 if pause > PAUSE_THR1 else 0)
            score += (-1 if pause > PAUSE_THR2 else 0)

        # Scoring based on sentence lengths: penalise very long sentences
        score += (-1 if self.sentence.getNbStamps() > 3 else 0)
        score += (-1 if self.sentence.getNbTokens() > WORDS_THR else 0)
        return score > 0


    def _flushSentence(self):
        """ Writes the tokens to the XML file (and the untokenised output if
        that option is activated) and clears the current sentence. Empty
        sentences are silently discarded.
        """
        nbTokens = self.sentence.getNbTokens()
        if not nbTokens:
            return
        self.nbTokens += nbTokens
        self.sid += 1
        self._pruneTokens()
        self._writeTokens()
        if self.rawOutput:
            self._writeRaw()

        # We record the text content for language identification purposes
        self.text += self.sentence.rawCorrected + "\n"

        self.sentence = Sentence()


    def _pruneTokens(self):
        """ Removes a duplicated continuation marker ("..." or "-") when the
        same marker appears on both sides of a pair of time stamps, keeping
        only the second occurrence.
        """
        entities = self.sentence.entities
        # The pattern spans indices i..i+3, so the last valid start position
        # is len(entities)-4 (the original bound len(entities)-4 as the range
        # stop skipped that final position)
        for i in range(1, len(entities)-3):
            if (entities[i][0]=="w"  and (entities[i][1]=="..." or entities[i][1]=="-")
                and entities[i+1][0]=="time" and entities[i+2][0]=="time"
                and entities[i+3][0]=="w" and entities[i+3][1]==entities[i][1]):
                self.sentence.entities = entities[0:i] + entities[i+1:i+3] + entities[i+4:]
                self.sentence.raw = self.sentence.raw.replace("... ...", " ")
                self.sentence.raw = self.sentence.raw.replace("- -", " ")
                break


    def _writeTokens(self):
        """ Writes the tokens in self.sentence to the XML file as a <s>
        element containing <w> and <time> children.
        """
        builder = et.TreeBuilder()
        sattrs = {"id":str(self.sid)}
        if self.sentence.isEmphasised():
            # Whole-sentence emphasis: mark the <s> element instead of
            # each individual token
            sattrs.update({"emphasis":"true"})
            for w in self.sentence.getTokens():
                del w[2]["emphasis"]
        builder.start("s",sattrs)
        tokid = 0
        entities = self.sentence.getEntities()
        for i, entity in enumerate(entities):

            # Write a <w> entity
            if entity[0]=="w":
                token = entity[1]
                tokid += 1

                builder.data("\n    ")
                wattrs = {"id":"%i.%i"%(self.sid,tokid)}
                wattrs.update(entity[2])
                builder.start("w",wattrs)
                builder.data(token)
                builder.end("w")

            # Write a <time> entity
            elif entity[0]=="time":
                builder.data("\n    ")
                builder.start("time",entity[1])
                builder.end("time")

        builder.data("\n  ")
        builder.end("s")
        tree = et.ElementTree(builder.close())
        self.output.write(b"  ")
        tree.write(self.output, encoding='utf-8')
        self.output.write(b"\n")


    def _writeRaw(self):
        """ Writes the raw (untokenised) sentence to the XML file, keeping
        only the initial and final time stamps.
        """
        builder = et.TreeBuilder()
        attrs = {"id":str(self.sid)}
        builder.start("s",attrs)

        # Add timing info at the beginning of the sentence
        entities = self.sentence.getEntities()
        if entities and entities[0][0] == "time":
            builder.data("\n    ")
            builder.start("time",entities[0][1])
            builder.end("time")

        builder.data("\n")
        builder.data(self.sentence.raw)

        # Add timing info at the end of the sentence
        if entities and entities[-1][0] == "time":
            builder.data("\n    ")
            builder.start("time",entities[-1][1])
            builder.end("time")

        builder.data("\n  ")
        builder.end("s")
        tree = et.ElementTree(builder.close())
        self.rawOutput.write(b"  ")
        tree.write(self.rawOutput, encoding='utf-8')
        self.rawOutput.write(b"\n")


    def _extractMetadata(self):
        """ Extracts meta-data on the subtitle and the conversion process,
        in order to append it to the end of the XML file.

        Raises:
            RuntimeError: if language identification indicates that the
                subtitle is not written in the expected language.
        """
        # NOTE(review): when self.meta is provided, it is mutated in place
        # (the "id" key is removed) — confirm callers do not rely on it
        meta = self.meta if self.meta else {}
        if "id" in meta:
            del meta["id"]
        meta["subtitle"] = meta["subtitle"] if "subtitle" in meta else {}
        meta["conversion"] = {}
        if self.lang:
             meta["subtitle"]["language"] = self.lang.name
             # Performs language identification
             langProb = self.lang.getProb(self.text)
             if langProb < 0.1 and not isinstance(self, BilingualConverter):
                 msg = "Subtitle is not encoded in " + self.lang.name
                 msg += " (distrib: " + str(utils.getProbDist(self.text)) + ")"
                 raise RuntimeError(msg)
             meta["subtitle"]["confidence"] = str(langProb)

        if self.curBlock:
            meta["subtitle"]["blocks"] = str(self.curBlock.id)
            meta["subtitle"]["duration"] = self.curBlock.end

        meta["conversion"]["sentences"] = str(self.sid)
        meta["conversion"]["tokens"] = str(self.nbTokens)
        meta["conversion"]["encoding"] = self.encodings[0]
        meta["conversion"]["ignored_blocks"] = str(self.nbIgnoredBlocks)
        sc = self.spellchecker
        meta["conversion"]["unknown_words"] = str(sc.nbUnknowns)
        meta["conversion"]["corrected_words"] = str(sc.nbCorrections)
        meta["conversion"]["truecased_words"] = str(sc.nbTruecased)
        return meta


    def _flushDocument(self):
        """ Adds the final meta-data to the XML file, and closes the XML
        document (both the tokenised and the raw outputs).
        """
        self._flushSentence()
        meta = self._extractMetadata()
        metaBuilder = et.TreeBuilder()
        metaBuilder.start("meta")

        # One child element per meta-data section, each with key/value leaves
        for part in meta:
            metaBuilder.data("\n    ")
            metaBuilder.start(part)
            if isinstance(meta[part], dict):
                for key in meta[part]:
                    metaBuilder.data("\n      ")
                    metaBuilder.start(key)
                    metaBuilder.data(meta[part][key])
                    metaBuilder.end(key)
            metaBuilder.data("\n    ")
            metaBuilder.end(part)

        metaBuilder.data("\n  ")
        metaBuilder.end("meta")
        tree = et.ElementTree(metaBuilder.close())
        for fd in [self.output, self.rawOutput]:
            if fd:
                fd.write(b"  ")
                tree.write(fd, encoding='utf-8')
                fd.write(b"\n</document>\n")


    def closeOutputs(self):
        """Closes the output file objects (but never standard output)."""
        if self.output != sys.stdout.buffer:
            self.output.close()
        if self.rawOutput:
            self.rawOutput.close()