def _load_ass(self):
    """Parse ASS subtitle content into a caption and queue it as a text stream.

    Nothing happens unless ``self.ontology["kind"]`` is ``"ass"`` and
    ``self._read()`` returns non-empty content.  The column layout is taken
    from the line that follows the first ``[Events]`` header; every line
    matching the ``ass subtitle line`` expression then becomes one slide
    whose begin/end timecodes come from the ``Start``/``End`` columns and
    whose text comes from the ``Text`` column onward.  If the normalized
    caption is valid, an ontology describing it is appended to
    ``self._execution["crawl"]["stream"]``.

    Malformed input is skipped instead of raising: an ``[Events]`` header on
    the very last line, a format line without ``Start``, ``End`` or ``Text``,
    and event rows too short to hold both timecodes simply contribute no
    slides.

    Returns:
        None.
    """
    if self.ontology["kind"] != "ass":
        return

    content = self._read()
    if not content:
        return

    expression = self.env.expression
    # A literal backslash followed by "N" (same value as the raw literal
    # ur"\N"), spelled so it is valid on both Python 2 and Python 3.
    line_break = u"\\N"

    caption = Caption(self.env)
    lines = content.splitlines()

    # Only the first [Events] header is examined, and only the line directly
    # after it is tried as the format line.
    formation = None
    for position, line in enumerate(lines):
        if line != u"[Events]":
            continue
        if position + 1 < len(lines):
            match = expression["ass formation line"].search(lines[position + 1])
            if match is not None:
                formation = match.group(1).strip().replace(u" ", u"").split(u",")
        break

    required_columns = ("Start", "End", "Text")
    if formation is not None and all(name in formation for name in required_columns):
        start = formation.index("Start")
        stop = formation.index("End")
        text = formation.index("Text")
        # Smallest field count that still holds both timecode columns.
        minimum_fields = max(start, stop) + 1

        subtitle_line = expression["ass subtitle line"]
        event_command = expression["ass event command"]
        condense_line_breaks = expression["ass condense line breaks"]
        empty_string = self.env.constant["empty string"]

        for line in lines:
            match = subtitle_line.search(line)
            if match is None:
                continue

            fields = match.group(1).strip().split(",")
            if len(fields) < minimum_fields:
                # Truncated event row: no timecodes to read.
                continue

            slide = Slide()
            slide.begin.timecode = fields[start]
            slide.end.timecode = fields[stop]

            # Rejoin everything from the Text column onward so commas that
            # were part of the text survive the split above.
            subtitle_text = u",".join(fields[text:])
            subtitle_text = event_command.sub(empty_string, subtitle_text)
            subtitle_text = subtitle_text.replace(u"\n", line_break)
            # A callable replacement inserts the marker verbatim instead of
            # having the backslash interpreted as a template escape.
            subtitle_text = condense_line_breaks.sub(
                lambda _match: line_break, subtitle_text
            )
            for fragment in subtitle_text.split(line_break):
                slide.add(fragment)

            caption.add(slide)

    caption.normalize()
    if caption.valid:
        mtype = self.env.enumeration["mediainfo stream type"].find("text")
        o = Ontology(self.env, mtype.node["namespace"])
        o["stream type"] = u"text"
        o["format"] = u"ASS"
        o["language"] = self.ontology["language"]
        o["content"] = caption.node
        self._execution["crawl"]["stream"].append(o)
def _load_srt(self):
    """Parse SRT subtitle content into a caption and queue it as a text stream.

    Nothing happens unless ``self.ontology["kind"]`` is ``"srt"`` and
    ``self._read()`` returns non-empty content.  A block starts wherever a
    line matching the ``srt time line`` expression is directly preceded by a
    line made only of digits (the counter line).  The block's text is every
    line after the timing line up to, but not including, the next block's
    counter line, or up to the end of the content for the final block.  If
    the normalized caption is valid, an ontology describing it is appended
    to ``self._execution["crawl"]["stream"]``.

    A final block whose timing line is the very last line has no text lines
    and is not added to the caption.

    Returns:
        None.
    """
    if self.ontology["kind"] != "srt":
        return

    content = self._read()
    if not content:
        return

    caption = Caption(self.env)
    lines = content.splitlines()
    time_line = self.env.expression["srt time line"]
    last_index = len(lines) - 1

    # First pass: locate every timing line that has a counter line above it.
    blocks = []
    for index, line in enumerate(lines):
        match = time_line.search(line)
        if match is None:
            continue
        # index 0 has no preceding line; without this guard lines[-1] would
        # silently use the LAST line of the file as the counter.
        if index == 0 or not lines[index - 1].strip().isdigit():
            continue
        slide = Slide()
        slide.begin.timecode = match.group(1)
        slide.end.timecode = match.group(2)
        blocks.append((index, slide))

    # Each block's text stops at the next block's counter line (one above
    # that block's timing line); the final block runs to the end.
    stops = [time_at - 1 for time_at, _ in blocks[1:]]
    stops.append(len(lines))

    # Second pass: attach the text lines and collect the slides in order.
    for (time_at, slide), stop in zip(blocks, stops):
        if time_at == last_index:
            # Only possible for the final block: nothing follows the timing
            # line, so there is no text to attach and the slide is dropped.
            continue
        for text_line in lines[time_at + 1 : stop]:
            slide.add(text_line)
        caption.add(slide)

    caption.normalize()
    if caption.valid:
        mtype = self.env.enumeration["mediainfo stream type"].find("text")
        o = Ontology(self.env, mtype.node["namespace"])
        o["stream type"] = u"text"
        o["format"] = u"UTF-8"
        o["language"] = self.ontology["language"]
        o["content"] = caption.node
        self._execution["crawl"]["stream"].append(o)