def doConversion(self):
    """Performs the conversion for a bilingual subtitle, setting up the
    processing state for the second language before delegating to the
    base-class conversion loop.
    """
    # Fresh per-run state for the second language
    self.sid2 = 0
    self.nbTokens2 = 0
    self.text2 = ""
    self.tokeniser2 = Tokeniser(self.lang2)
    self.spellchecker2 = SpellChecker(self.lang2)
    # The base class drives the actual read/convert/write loop
    SubtitleConverter.doConversion(self)
    self.tokeniser2.close()
def doConversion(self):
    """Performs the conversion process: reads the subtitle blocks one by
    one and writes the converted content into the output file(s).
    """
    # Reset all per-run state
    self.curLine = None        # Current line in the raw file
    self.curBlock = None       # Current block
    self.curLineIndex = 0      # Current line index in the raw file
    self.timeOffset = 0        # Time offset (for multi-CD subtitles)
    self.sid = 0               # Current sentence identifier
    self.nbTokens = 0          # Total number of words
    self.nbIgnoredBlocks = 0   # Number of ignored subtitle blocks
    self.sentence = Sentence() # Tokens in the current sentence
    self.text = ""             # Collection of all subtitle lines

    # Start the tokeniser and the spellchecker for the subtitle language
    self.tokeniser = Tokeniser(self.lang)
    self.spellchecker = SpellChecker(self.lang)

    self._startDocument()

    # Main loop over the subtitle blocks
    block = self._readBlock()
    while block:
        if block.isSpurious():
            # Skip spurious blocks, but keep count of them
            self.nbIgnoredBlocks += 1
        else:
            self._writeBlock(block)
        block = self._readBlock()

    self._flushDocument()
    self.tokeniser.close()
class BilingualConverter(SubtitleConverter):
    """Special converter for handling bilingual subtitles (with the first
    line of each block in one language, and the second line in another).

    The converter keeps two parallel sets of outputs and processing state
    (tokeniser, spellchecker, sentence counters, text buffer) and swaps
    between them with _switchLanguage() while writing each block.
    """

    def __init__(self, input, output, output2, rawOutput=None, rawOutput2=None,
                 language=None, language2=None, meta=None, encoding=None,
                 alwaysSplit=False):
        """Creates a new converter for a given input and output (as file
        objects). A second file object for the raw output can also be provided.

        Args:
            output2(file object): XML file for second language
            rawOutput2(file object): XML file for second language
            language2(Language object): second language

        The other arguments are similar to the SubtitleConverter.
        """
        SubtitleConverter.__init__(self, input, output, rawOutput, language,
                                   meta, encoding, alwaysSplit)
        # NOTE(review): language2 is effectively required despite the None
        # default -- the next line raises AttributeError otherwise.
        self.encodings += language2.encodings
        # Re-run charset detection now that both languages' candidate
        # encodings are known, and try the detected one first
        detected = detectEncoding(self.inputs[0], self.encodings)
        self.encodings = [detected] + self.encodings
        self.lang2 = language2
        self.output2 = output2
        self.rawOutput2 = rawOutput2

    def doConversion(self):
        """Performs the conversion process, reading the full subtitle file
        and writing the converted content into the output files (one per
        language).
        """
        # State for the second language (the first language's state is
        # initialised by SubtitleConverter.doConversion)
        self.text2 = ""
        self.nbTokens2 = 0
        self.sid2 = 0
        self.tokeniser2 = Tokeniser(self.lang2)
        self.spellchecker2 = SpellChecker(self.lang2)
        SubtitleConverter.doConversion(self)
        self.tokeniser2.close()

    def _startDocument(self):
        """Writes the XML header for both languages' documents."""
        SubtitleConverter._startDocument(self)
        self._switchLanguage()
        SubtitleConverter._startDocument(self)
        self._switchLanguage()

    def _flushDocument(self):
        """Writes the final meta-data and closes both XML documents."""
        SubtitleConverter._flushDocument(self)
        self._switchLanguage()
        SubtitleConverter._flushDocument(self)
        self._switchLanguage()

    def _writeBlock(self, block):
        """Processes the block content (doing sentence segmentation and
        tokenisation) in the particular case of "bilingual" subtitles, where
        two languages are used in the subtitle (one on each line). In this
        setting, we always split sentences at the end of each block.
        """
        # Loops on each line of the subtitle block
        for linenum in range(len(block.lines)):
            self.sentence = Sentence()
            self.sentence.addStamp("T%sS"%block.id, block.start)
            self._recordLine(block, linenum)
            # Bug fix: the closing stamp must carry the block's END time,
            # consistent with SubtitleConverter._writeBlock (it previously
            # reused block.start)
            self.sentence.addStamp("T%sE"%block.id, block.end)
            self._flushSentence()
            # After the first line, switch to the second language for the
            # remaining line(s) of the block
            if not linenum:
                self._switchLanguage()
        # Switch back to the first language before the next block
        if block.lines:
            self._switchLanguage()

    def _switchLanguage(self):
        """Swaps all language-dependent state between the two languages."""
        self.output, self.output2 = self.output2, self.output
        self.rawOutput, self.rawOutput2 = self.rawOutput2, self.rawOutput
        self.text, self.text2 = self.text2, self.text
        self.nbTokens, self.nbTokens2 = self.nbTokens2, self.nbTokens
        self.lang, self.lang2 = self.lang2, self.lang
        self.sid, self.sid2 = self.sid2, self.sid
        self.tokeniser, self.tokeniser2 = self.tokeniser2, self.tokeniser
        self.spellchecker, self.spellchecker2 = self.spellchecker2, self.spellchecker

    def closeOutputs(self):
        """Closes all output file objects (unless writing to stdout)."""
        SubtitleConverter.closeOutputs(self)
        if self.output2 != sys.stdout.buffer:
            self.output2.close()
        if self.rawOutput2:
            self.rawOutput2.close()
class SubtitleConverter:
    """Converts a raw subtitle file into a tokenised XML document, doing
    sentence segmentation, tokenisation and spellchecking along the way.
    Optionally also writes an untokenised ("raw") XML version.
    """

    def __init__(self, input, output, rawOutput=None, language=None,
                 meta=None, encoding=None, alwaysSplit=False):
        """Creates a new converter for a given input and output (as file
        objects). A second file object for the raw output can also be provided.

        Args:
            input(file object or list of file objects): raw subtitle files
            output(file object): XML subtitle file for the tokenised output
            rawOutput(file object): XML subtitle file for the untokenised output
            language(Language object): language for the subtitle
            meta(dict): meta-data to append to the end of the XML file(s)
            encoding(str): file encoding to use to read the raw subtitle files
            alwaysSplit(bool): whether to always split subtitle blocks as
                new sentences (default is false).
        """
        self.lang = language
        self.alwaysSplit = alwaysSplit
        # Some languages always split sentences at block boundaries
        if self.lang and self.lang.alwaysSplit:
            self.alwaysSplit = True
        # Multi-CD subtitles come as a list of input files
        self.inputs = input if isinstance(input,list) else [input]
        # Candidate encodings in order of preference: the explicit one
        # first, then the language's usual encodings
        self.encodings = [encoding] if encoding else []
        self.encodings += (self.lang.encodings if self.lang else [])
        # For unknown or "difficult" languages, run charset detection and
        # try the detected encoding first
        if not self.lang or self.lang.codes[0] in difficult_langs:
            detected = detectEncoding(self.inputs[0], self.encodings)
            self.encodings = [detected] + self.encodings
        self.output = output
        self.rawOutput = rawOutput
        self.meta = meta

    def doConversion(self):
        """Performs the conversion process, reading the full subtitle file
        and writing the converted content into the output file.
        """
        self.curLine = None        # Current line in the raw file
        self.curBlock = None       # Current block
        self.curLineIndex = 0      # Current line index in the raw file
        self.timeOffset = 0        # Time offset (for multi-CD subtitles)
        self.sid = 0               # Current sentence identifier
        self.nbTokens = 0          # Total number of words
        self.nbIgnoredBlocks = 0   # Number of ignored subtitle blocks
        self.sentence = Sentence() # Tokens in the current sentence
        self.text = ""             # Collection of all subtitle lines

        # Starting the tokeniser and spellchecker
        self.tokeniser = Tokeniser(self.lang)
        self.spellchecker = SpellChecker(self.lang)

        self._startDocument()

        # Looping on the subtitle blocks
        block = self._readBlock()
        while block:
            # Ignoring spurious subtitle blocks
            if block.isSpurious():
                self.nbIgnoredBlocks += 1
                block = self._readBlock()
                continue
            self._writeBlock(block)
            block = self._readBlock()

        self._flushDocument()
        self.tokeniser.close()

    def _startDocument(self):
        """Writes the header of the XML subtitle file.

        The document id comes from the meta-data when available, otherwise
        from the input file's base name.
        """
        id = self.meta["id"] if self.meta and "id" in self.meta else ""
        if not id and self.inputs and hasattr(self.inputs[0],"name"):
            id = os.path.basename(self.inputs[0].name).split(".")[0]
        id = id.encode("utf-8")
        self.output.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        self.output.write(b'<document id="' + id + b'">\n')
        if self.rawOutput:
            self.rawOutput.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            self.rawOutput.write(b'<document id="' + id + b'">\n')

    def _readBlock(self, recursive=0):
        """Reads one subtitle block and returns it.

        Returns None at the end of the last input file. `recursive` counts
        consecutive unparsable blocks so we can give up on files whose
        timing lines never parse (likely a wrong encoding).
        """
        block = SubtitleBlock()
        block.previous = self.curBlock
        block.offset = self.timeOffset

        # Reads the very first line
        if not self.curLine:
            self._readline()
        elif recursive > 20:
            # Too many consecutive unparsable blocks: give up
            raise RuntimeError("Wrong encoding format for subtitle")

        # Continues until a non-empty line is found
        while self.curLine and not self.curLine.strip():
            self._readline()

        # If we arrive at the end of the file object, checks whether any
        # other file should be read (in case of multi-CD subtitles). If yes,
        # opens the new file and continue. Else, returns None.
        if not self.curLine:
            self.inputs.pop(0)
            self.curLineIndex = 0
            if self.inputs:
                nextBlock = self._readBlock()
                lasttime = tosecs(block.previous.end) if block.previous else 0
                # shifting the start and end times after the first CD
                if nextBlock and nextBlock.start and lasttime > tosecs(nextBlock.start):
                    nextBlock.start = addsecs(nextBlock.start, lasttime-self.timeOffset)
                    nextBlock.end = addsecs(nextBlock.end, lasttime-self.timeOffset)
                    self.timeOffset = lasttime
                return nextBlock
            else:
                return None

        # Detects the subtitle identifier
        numberMatch = numberRegex.match(self.curLine)
        if numberMatch:
            block.setId(int(numberMatch.group(1)))
            self._readline()
        else:
            # No explicit id in the file: number the block ourselves
            block.setId((self.curBlock.id+1) if self.curBlock else 1)

        # Ignores empty lines
        while self.curLine and not self.curLine.strip():
            self._readline()

        # Detects the start and end time
        timingMatch = timingRegex.match(self.curLine)
        if not timingMatch:
            sys.stderr.write("Cannot parse timing (line number: %i): %s"
                             %(self.curLineIndex, self.curLine))
            self._readline()
            self.nbIgnoredBlocks += 1
            return self._readBlock(recursive+1)
        block.setTiming(timingMatch.group(1), timingMatch.group(2))

        # Reads the subtitle content until we arrive at the next subtitle ID
        # or the end of the file (NB: simply stopping at an empty line does
        # not always work, since some files strangely contain empty lines
        # within subtitle blocks).
        self._readline()
        while self.curLine.strip():
            block.addLine(self.curLine)
            self._readline()
        while self.curLine and not numberRegex.match(self.curLine):
            block.addLine(self.curLine)
            self._readline()

        self.curBlock = block
        return block

    def _readline(self):
        """Reads the next line in the file, decodes it according to the
        current encoding, and returns it. If a decoding error is detected,
        tries to change the encoding if an alternative is possible.
        """
        if self.inputs:
            binaryLine = self.inputs[0].readline()
            self.curLine = None
            # Try the candidate encodings in order, discarding any that fail
            while self.curLine==None and self.encodings:
                encoding = self.encodings[0]
                try:
                    self.curLine = binaryLine.decode(encoding)
                except UnicodeDecodeError:
                    # If we get a decoding error, removes the encoding from
                    # the list of possible encodings, and retry.
                    self.encodings.remove(encoding)
            if self.curLine==None:
                raise RuntimeError("Decoding error (encoding: %s, line: %i)"
                                   %(encoding, self.curLineIndex))
            elif self.curLineIndex==0:
                # Strip a leading BOM on the very first line
                self.curLine = self.curLine.lstrip("\ufeff")
            self.curLineIndex += 1

    def _writeBlock(self, block):
        """Processes the block content by doing sentence segmentation,
        tokenisation, and writes the results into the XML file.
        """
        # First check whether the block is a continuation of the previous
        # sentence. If not, "flush" the current sentence to start a new one.
        if not self._isContinuation(block):
            self._flushSentence()
        self.sentence.addStamp("T%sS"%block.id, block.start)
        # Loops on each line of the subtitle block
        for linenum in range(0,len(block.lines)):
            self.sentence.addRawChar(' ' if self.sentence.raw else '')
            self._recordLine(block, linenum)
        self.sentence.addStamp("T%sE"%block.id, block.end)

    def _recordLine(self, block, linenum):
        """Records the subtitle line, checking for the occurrence of
        end-of-sentence markers along the way, and flushing the current
        sentence in that case.
        """
        # Doing the actual tokenisation
        line = block.lines[linenum]
        tokens = self.tokeniser.tokenise(line)
        curPos = 0  # Current character position in the line
        # Heuristic: the line is "all-uppercase" if more than 2/3 of its
        # characters are uppercase or non-alphabetic
        upperline = len([c for c in line if c.isupper() or not c.isalpha()]) > 2*len(line)/3
        for i, token in enumerate(tokens):
            curPos += len(token)
            # Assume a new sentence if an utterance started with "-" is found
            if (token=="-" and i < len(tokens)-1 and
                (tokens[i+1][0].isupper() or (self.lang and self.lang.unicase))):
                self._flushSentence()
            emphasised = block.isEmphasised(linenum, curPos)
            prev = self.sentence.lastToken
            # Handle all-uppercase tokens
            if token.isupper() and ((not token.istitle() and self.spellchecker.lm) or upperline):
                corrected = self.spellchecker.recapitalise(token, prev, upperline)
                if corrected != token:
                    self.sentence.addToken(token, emphasised|(not upperline), alternative=corrected)
                else:
                    self.sentence.addToken(token, emphasised)
            # Usual case
            else:
                corrected, prob = self.spellchecker.spellcheck(token, prev)
                # A title-cased word after end-of-sentence punctuation
                # starts a new sentence
                if prev in stopPunctuations2 and corrected.istitle():
                    self._flushSentence()
                    emphasised = block.isEmphasised(linenum, curPos)
                if corrected == token:
                    self.sentence.addToken(token, emphasised)
                elif prob > 0.8:
                    self.sentence.addToken(corrected, emphasised, initial=token)
                else:
                    self.sentence.addToken(token, emphasised, alternative=corrected)
            # Preserve the whitespace following the token in the raw text
            while curPos < len(line) and line[curPos].isspace():
                self.sentence.addRawChar(line[curPos])
                curPos += 1
            # Do not flush the sentence for the last token in the last line
            if ((linenum==len(block.lines)-1 and i==len(tokens)-1)
                or (i < len(tokens)-1 and tokens[i+1]=="\"")):
                continue
            if token[0] in stopPunctuations1:
                self._flushSentence()
            elif (token[0] in stopPunctuations2 and i > 0 and
                  (i==len(tokens)-1 or tokens[i+1][0].isupper()
                   or tokens[i+1][0]=="l" or (self.lang and self.lang.unicase))):
                self._flushSentence()

    def _isContinuation(self, block):
        """Returns true if the block is likely to be a continuation of the
        current sentence.

        Uses a heuristic score combining punctuation and casing cues from
        the previous and current block, time gaps, and sentence length.
        """
        if (not self.sentence or not block.lines
            or not block.previous or not block.previous.lines):
            return True
        elif self.alwaysSplit:
            return False

        score = 0  # Initial continuation score

        # Scoring based on the end of the previous block
        lastline = block.previous.lines[-1].rstrip(")]} ")
        stopEndings = stopPunctuations1 + stopPunctuations2 + ["\""]
        if lastline.endswith("..."):
            score += 2
        elif lastline and lastline[-1] in stopEndings:
            score += -3

        # Scoring based on the beginning of the current block
        newline = block.lines[0].lstrip("'[*# ")
        if not newline:
            score += -2
        elif lastline.endswith("-") and newline.startswith("-"):
            score += 2
        elif newline[0] in ["-","\"", "¿", "¡", "'"]:
            score += -2
        elif newline.startswith("..."):
            score += 2
        elif newline[0].isupper():
            score += -3
        elif newline[0].islower():
            score += 2
        elif newline[0].isnumeric() or (self.lang and self.lang.unicase):
            score += 1

        # Scoring based on time gaps
        if block.start and block.previous.end:
            pause = tosecs(block.start) - tosecs(block.previous.end)
            score += (-1 if pause > PAUSE_THR1 else 0)
            score += (-1 if pause > PAUSE_THR2 else 0)

        # Scoring based on sentence lengths
        score += (-1 if self.sentence.getNbStamps() > 3 else 0)
        score += (-1 if self.sentence.getNbTokens() > WORDS_THR else 0)

        return True if score > 0 else False

    def _flushSentence(self):
        """Writes the tokens to the XML file (and the untokenised output if
        that option is activated) and clears the current sentence.
        """
        nbTokens = self.sentence.getNbTokens()
        # Nothing to flush for an empty sentence
        if not nbTokens:
            return
        self.nbTokens += nbTokens
        self.sid += 1
        self._pruneTokens()
        self._writeTokens()
        if self.rawOutput:
            self._writeRaw()
        # We record the text content for language identification purposes
        self.text += self.sentence.rawCorrected + "\n"
        self.sentence = Sentence()

    def _pruneTokens(self):
        """Removes a duplicated "..." or "-" token pair that surrounds two
        consecutive time stamps (an artefact of block continuations).
        """
        entities = self.sentence.entities
        for i in range(1, len(entities)-4):
            if (entities[i][0]=="w" and (entities[i][1]=="..." or entities[i][1]=="-")
                and entities[i+1][0]=="time" and entities[i+2][0]=="time"
                and entities[i+3][0]=="w" and entities[i+3][1]==entities[i][1]):
                # Drop the two duplicate word entities, keep the time stamps
                self.sentence.entities = entities[0:i] +entities[i+1:i+3] + entities[i+4:]
                self.sentence.raw = self.sentence.raw.replace("... ...", " ")
                self.sentence.raw = self.sentence.raw.replace("- -", " ")
                break

    def _writeTokens(self):
        """Writes the tokens in self.sentence to the XML file."""
        builder = et.TreeBuilder()
        sattrs = {"id":str(self.sid)}
        # If the whole sentence is emphasised, mark the <s> element instead
        # of each individual word
        if self.sentence.isEmphasised():
            sattrs.update({"emphasis":"true"})
            for w in self.sentence.getTokens():
                del w[2]["emphasis"]
        builder.start("s",sattrs)
        tokid = 0
        entities = self.sentence.getEntities()
        for i, entity in enumerate(entities):
            # Write a <w> entity
            if entity[0]=="w":
                token = entity[1]
                tokid += 1
                builder.data("\n ")
                wattrs = {"id":"%i.%i"%(self.sid,tokid)}
                wattrs.update(entity[2])
                builder.start("w",wattrs)
                builder.data(token)
                builder.end("w")
            # Write a <time> entity
            elif entity[0]=="time":
                builder.data("\n ")
                builder.start("time",entity[1])
                builder.end("time")
        builder.data("\n ")
        builder.end("s")
        tree = et.ElementTree(builder.close())
        self.output.write(b" ")
        tree.write(self.output, encoding='utf-8')
        self.output.write(b"\n")

    def _writeRaw(self):
        """Writes the raw sentence to the XML file."""
        builder = et.TreeBuilder()
        attrs = {"id":str(self.sid)}
        builder.start("s",attrs)
        # Add timing info at the beginning of the sentence
        entities = self.sentence.getEntities()
        if entities and entities[0][0] == "time":
            builder.data("\n ")
            builder.start("time",entities[0][1])
            builder.end("time")
        builder.data("\n")
        builder.data(self.sentence.raw)
        # Add timing info at the end of the sentence
        if entities and entities[-1][0] == "time":
            builder.data("\n ")
            builder.start("time",entities[-1][1])
            builder.end("time")
        builder.data("\n ")
        builder.end("s")
        tree = et.ElementTree(builder.close())
        self.rawOutput.write(b" ")
        tree.write(self.rawOutput, encoding='utf-8')
        self.rawOutput.write(b"\n")

    def _extractMetadata(self):
        """Extracts meta-data on the subtitle and the conversion process,
        in order to append it to the end of the XML file.

        Raises:
            RuntimeError: if language identification suggests the subtitle
                is not in the expected language.
        """
        meta = self.meta if self.meta else {}
        if "id" in meta:
            del meta["id"]
        meta["subtitle"] = meta["subtitle"] if "subtitle" in meta else {}
        meta["conversion"] = {}
        if self.lang:
            meta["subtitle"]["language"] = self.lang.name
            # Performs language identification
            langProb = self.lang.getProb(self.text)
            if langProb < 0.1 and not isinstance(self, BilingualConverter):
                msg = "Subtitle is not encoded in " + self.lang.name
                msg += " (distrib: " + str(utils.getProbDist(self.text)) + ")"
                raise RuntimeError(msg)
            meta["subtitle"]["confidence"] = str(langProb)
        if self.curBlock:
            meta["subtitle"]["blocks"] = str(self.curBlock.id)
            meta["subtitle"]["duration"] = self.curBlock.end
        meta["conversion"]["sentences"] = str(self.sid)
        meta["conversion"]["tokens"] = str(self.nbTokens)
        meta["conversion"]["encoding"] = self.encodings[0]
        meta["conversion"]["ignored_blocks"] = str(self.nbIgnoredBlocks)
        sc = self.spellchecker
        meta["conversion"]["unknown_words"] = str(sc.nbUnknowns)
        meta["conversion"]["corrected_words"] = str(sc.nbCorrections)
        meta["conversion"]["truecased_words"] = str(sc.nbTruecased)
        return meta

    def _flushDocument(self):
        """Adds the final meta-data to the XML file, and closes the XML
        document.
        """
        self._flushSentence()
        meta = self._extractMetadata()
        metaBuilder = et.TreeBuilder()
        metaBuilder.start("meta")
        for part in meta:
            metaBuilder.data("\n ")
            metaBuilder.start(part)
            if isinstance(meta[part],dict):
                for key in meta[part]:
                    metaBuilder.data("\n ")
                    metaBuilder.start(key)
                    metaBuilder.data(meta[part][key])
                    metaBuilder.end(key)
            metaBuilder.data("\n ")
            metaBuilder.end(part)
        metaBuilder.data("\n ")
        metaBuilder.end("meta")
        tree = et.ElementTree(metaBuilder.close())
        # Append the meta element and the closing tag to both outputs
        for fd in [self.output,self.rawOutput]:
            if fd:
                fd.write(b" ")
                tree.write(fd, encoding='utf-8')
                fd.write(b"\n</document>\n")

    def closeOutputs(self):
        """Closes the output file objects (unless writing to stdout)."""
        if self.output != sys.stdout.buffer:
            self.output.close()
        if self.rawOutput:
            self.rawOutput.close()