Example #1
0
def middleware_convert_sub(response, **kwargs):
    """Rewrite a subtitle response as WebVTT when its format is recognised.

    The body is read from ``response.stream.content`` and decoded as UTF-8.
    When ``detect_format`` returns a reader for it, the body is replaced by
    its WebVTT conversion (UTF-8 encoded) and the ``content-type`` header is
    set to ``text/vtt``. Otherwise the response is left untouched.

    Extra keyword arguments are accepted and ignored.
    """
    subtitle_text = response.stream.content.decode('utf8')
    reader_cls = detect_format(subtitle_text)
    if not reader_cls:
        return

    writer = WebVTTWriter()
    vtt_text = writer.write(reader_cls().read(subtitle_text))
    response.stream.content = vtt_text.encode('utf8')
    response.headers['content-type'] = 'text/vtt'
Example #2
0
    def test_lang_option(self, sample_webvtt_multi_lang_en,
                         sample_webvtt_multi_lang_de,
                         sample_sami_with_multi_lang):
        """The language passed to ``write`` selects which captions are used."""
        captions = SAMIReader().read(sample_sami_with_multi_lang)

        german_vtt = WebVTTWriter().write(captions, 'de-DE')
        assert sample_webvtt_multi_lang_de == german_vtt

        english_vtt = WebVTTWriter().write(captions, 'en-US')
        assert sample_webvtt_multi_lang_en == english_vtt
Example #3
0
def _webvtt(url, _data_path, _headers, **kwargs):
    """Download subtitles from ``url`` and save them as WebVTT.

    The response body is decoded as UTF-8, parsed with the reader returned
    by ``detect_format`` and written to ``_data_path`` as UTF-8 WebVTT.

    Returns ``_data_path`` with ``|content-type=text/vtt`` appended. Extra
    keyword arguments are accepted and ignored.
    """
    reply = Session().get(url, headers=_headers)
    source_text = reply.content.decode('utf8')
    reader_cls = detect_format(source_text)

    writer = WebVTTWriter()
    vtt_text = writer.write(reader_cls().read(source_text))
    with open(_data_path, 'wb') as out:
        out.write(vtt_text.encode('utf8'))

    content_type_suffix = '|content-type=text/vtt'
    return _data_path + content_type_suffix
Example #4
0
 def __init__(self, readers, caption_str):
     """
     Store the inputs and prepare a WebVTT writer for the conversion.

     :param readers: An array of `SubtitleReader` instances
     :param caption_str: A string with the captions content
     """
     vtt_writer = WebVTTWriter()
     # Use a nominal "video size" of 100: other caption types may carry
     # layout information, and 100 should work to generate % values.
     vtt_writer.video_width = 100
     vtt_writer.video_height = vtt_writer.video_width * 6 / 19

     self.readers = readers
     self.caption_str = caption_str
     self.writer = vtt_writer
     # No caption set is available at construction time.
     self.caption_set = None
Example #5
0
def middleware_convert_sub(response, **kwargs):
    """Rewrite a subtitle response as WebVTT when its format is recognised.

    The body is read from ``response.stream.content`` and decoded as UTF-8.
    When ``detect_format`` returns a reader for it, the body is replaced by
    its UTF-8 encoded WebVTT conversion and the ``content-type`` header is
    set to ``text/vtt``. When ``ADDON_DEV`` is truthy the converted bytes
    are additionally written to ``special://temp/convert_sub.middleware``.

    Unrecognised bodies leave the response untouched. Extra keyword
    arguments are accepted and ignored.
    """
    subtitle_text = response.stream.content.decode('utf8')
    reader_cls = detect_format(subtitle_text)
    if not reader_cls:
        return

    writer = WebVTTWriter()
    vtt_bytes = writer.write(reader_cls().read(subtitle_text)).encode('utf8')

    if ADDON_DEV:
        # Keep a copy of the converted subtitles on disk.
        dump_path = xbmc.translatePath('special://temp/convert_sub.middleware')
        with open(dump_path, 'wb') as dump:
            dump.write(vtt_bytes)

    response.stream.content = vtt_bytes
    response.headers['content-type'] = 'text/vtt'
Example #6
0
    def test_break_node_positioning_is_ignored(
            self, webvtt_from_dfxp_with_conflicting_align,
            dfxp_style_region_align_conflict):
        """DFXP with conflicting alignment converts to the expected WebVTT."""
        parsed = DFXPReader().read(dfxp_style_region_align_conflict)
        rendered = WebVTTWriter().write(parsed)

        assert webvtt_from_dfxp_with_conflicting_align == rendered
 def test_dfxp_with_positioning_to_webvtt_conversion(self):
     # Convert the positioned DFXP sample with explicit video dimensions and
     # compare the result with the stored WebVTT sample.
     caption_set = DFXPReader().read(SAMPLE_DFXP_WITH_POSITIONING)
     results = WebVTTWriter(video_width=VIDEO_WIDTH,
                            video_height=VIDEO_HEIGHT).write(caption_set)
     # assertIsInstance reports the actual type on failure, which
     # assertTrue(isinstance(...)) cannot do.
     self.assertIsInstance(results, str)
     self.assertWebVTTEquals(
         SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING_AND_STYLE, results)
    def test_positioning_is_kept(self,
                                 sample_webvtt_from_dfxp_with_positioning):
        """Reading and re-writing the positioned sample returns it unchanged."""
        expected = sample_webvtt_from_dfxp_with_positioning
        parsed = WebVTTReader().read(expected)
        rendered = WebVTTWriter().write(parsed)

        assert expected == rendered
Example #9
0
    def test_dfxp_to_webvtt_conversion(self, sample_webvtt_from_dfxp,
                                       sample_dfxp):
        """The DFXP sample converts to a ``str`` matching the WebVTT sample."""
        parsed = DFXPReader().read(sample_dfxp)
        vtt_output = WebVTTWriter().write(parsed)

        assert isinstance(vtt_output, str)
        self.assert_webvtt_equals(sample_webvtt_from_dfxp, vtt_output)
 def load_subtitles(self, video_id, langs=('ru',)):
     """Download automatic YouTube subtitles and convert them to WebVTT.

     For every language in ``langs`` that ``subs_exists`` does not already
     report, the automatic captions are requested through youtube-dl in
     TTML format. If the expected ``<video_id>.<lang>.ttml`` file is then
     present in the video directory, it is converted to a ``.vtt`` file
     with the same base name.

     :param video_id: YouTube video id, also used to build the file names.
     :param langs: iterable of subtitle language codes to load.
     """
     for lang in langs:
         if subs_exists(video_id, lang):
             continue
         print('Loading {} subtitles for {}'.format(lang, video_id))
         video_dir = get_dir(video_id)
         opts = {
             'writeautomaticsub': True,
             # Request only the language handled by this iteration; passing
             # the whole ``langs`` sequence asked for every language again
             # on each pass through the loop.
             'subtitleslangs': [lang],
             'subtitlesformat': 'ttml',
             'nooverwrites': True,
             'skip_download': True,
             'outtmpl': join(video_dir, video_id + '.ttml')
         }
         with youtube_dl.YoutubeDL(opts) as ytdl:
             ytdl.download(['https://www.youtube.com/watch?v={}'.format(video_id)])
         # WebVTT captions from YouTube contain duplicate phrases with
         # overlapping time segments, which is uncomfortable to read. That is
         # why the subtitles are first downloaded in TTML format and only
         # then converted to WebVTT.
         subs_path_ttml = join(video_dir, video_id + '.' + lang + '.ttml')
         subs_path_vtt = join(video_dir, video_id + '.' + lang + '.vtt')
         if exists(subs_path_ttml):
             print('converting subtitles')
             with open(subs_path_ttml, encoding='utf-8') as f:
                 subs = DFXPReader().read(f.read())
             with open(subs_path_vtt, 'w', encoding='utf-8') as f:
                 f.write(WebVTTWriter().write(subs))
Example #11
0
    def test_webvtt_newlines_are_properly_rendered(self):
        # Convert the SCC sample and compare it with the stored WebVTT output.
        parsed = SCCReader().read(
            SCC_THAT_GENERATES_WEBVTT_WITH_PROPER_NEWLINES)
        rendered = WebVTTWriter().write(parsed)

        expected = SAMPLE_WEBVTT_FROM_SCC_PROPERLY_WRITES_NEWLINES_OUTPUT
        self.assertEqual(rendered, expected)
Example #12
0
 def test_dfxp_to_webvtt_preserves_proper_alignment(self):
     # This failed at one point when the CaptionSet had node breaks with
     # different positioning. It was fixed both at the DFXPReader AND the
     # WebVTTWriter.
     caption_set = DFXPReader().read(DFXP_STYLE_REGION_ALIGN_CONFLICT)
     results = WebVTTWriter().write(caption_set)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(WEBVTT_FROM_DFXP_WITH_CONFLICTING_ALIGN, results)
    def test_srt_to_webvtt_conversion(self, sample_webvtt_from_srt,
                                      sample_srt):
        """The SRT sample converts to a ``str`` matching the WebVTT sample."""
        parsed = SRTReader().read(sample_srt)
        vtt_output = WebVTTWriter().write(parsed)

        assert isinstance(vtt_output, str)
        self.assert_webvtt_equals(sample_webvtt_from_srt, vtt_output)
Example #14
0
    def test_dfxp_to_webvtt_adds_explicit_size(
            self, sample_webvtt_output_long_cue, sample_dfxp_long_cue):
        """The long-cue DFXP sample converts to the expected WebVTT text."""
        parsed = DFXPReader().read(sample_dfxp_long_cue)
        vtt_output = WebVTTWriter().write(parsed)

        assert isinstance(vtt_output, str)
        assert sample_webvtt_output_long_cue == vtt_output
Example #15
0
class WebVTTWriterTestCase(unittest.TestCase):
    # Exercises WebVTTWriter through a writer instance built in setUp.

    def setUp(self):
        self.writer = WebVTTWriter()

    def test_double_br(self):
        # The SAMI double-<br> sample must render as the stored WebVTT
        # sample; both samples are decoded from UTF-8 first.
        parsed = SAMIReader().read(SAMPLE_SAMI_DOUBLE_BR.decode(u'utf-8'))
        expected = SAMPLE_WEBVTT_DOUBLE_BR.decode(u'utf-8')
        rendered = self.writer.write(parsed)
        self.assertEqual(expected, rendered)
    def test_sami_to_webvtt_conversion(self, sample_webvtt_from_sami,
                                       sample_sami):
        """The SAMI sample, written for a 640x360 video, matches the sample."""
        parsed = SAMIReader().read(sample_sami)
        writer = WebVTTWriter(video_width=640, video_height=360)
        vtt_output = writer.write(parsed)

        assert isinstance(vtt_output, str)
        self.assert_webvtt_equals(sample_webvtt_from_sami, vtt_output)
    def test_webvtt_newlines_are_properly_rendered(
            self, sample_webvtt_from_scc_properly_writes_newlines_output,
            scc_that_generates_webvtt_with_proper_newlines):
        """The SCC sample converts to the stored WebVTT output."""
        parsed = SCCReader().read(
            scc_that_generates_webvtt_with_proper_newlines)
        rendered = WebVTTWriter().write(parsed)

        expected = sample_webvtt_from_scc_properly_writes_newlines_output
        assert rendered == expected
Example #18
0
class WebVTTWriterTestCase(unittest.TestCase):
    # Exercises WebVTTWriter through a writer instance built in setUp.

    def setUp(self):
        self.writer = WebVTTWriter()

    def test_double_br(self):
        # Parse the UTF-8 decoded SAMI sample, then check that the writer
        # produces the UTF-8 decoded WebVTT sample.
        sami_captions = SAMIReader().read(
            SAMPLE_SAMI_DOUBLE_BR.decode(u'utf-8'))
        expected_vtt = SAMPLE_WEBVTT_DOUBLE_BR.decode(u'utf-8')
        self.assertEqual(expected_vtt, self.writer.write(sami_captions))
def convert_subs_to_vtt(input_subs_path, output_vtt_path):
    """Convert a subtitle file in a detectable format into a WebVTT file.

    The input is read as raw bytes and decoded with the encoding reported
    by ``utils.get_file_encoding``. The reader returned by ``detect_format``
    parses the text, and the WebVTT result is written to
    ``output_vtt_path`` encoded as UTF-8.

    :param input_subs_path: path of the subtitle file to convert.
    :param output_vtt_path: path of the WebVTT file to create or overwrite.
    """
    # Read bytes so the explicit decode below is valid on Python 2 and 3;
    # a text-mode handle returns ``str`` on Python 3, which has no
    # ``decode`` method.
    with open(input_subs_path, 'rb') as f:
        raw = f.read()
    text = raw.decode(utils.get_file_encoding(input_subs_path))

    reader = detect_format(text)
    subs = reader().read(text)

    output_text = WebVTTWriter().write(subs)

    # Encode explicitly instead of relying on the handle's default encoding,
    # which fails or varies by platform for non-ASCII captions.
    with open(output_vtt_path, 'wb') as w:
        w.write(output_text.encode('utf-8'))
Example #20
0
    def test_dfxp_to_webvtt_preserves_proper_alignment(
            self, webvtt_from_dfxp_with_conflicting_align,
            dfxp_style_region_align_conflict):
        """DFXP with conflicting alignment converts to the expected WebVTT."""
        # Regression check: this used to fail when the CaptionSet had node
        # breaks with different positioning. The fix touched both the
        # DFXPReader and the WebVTTWriter.
        parsed = DFXPReader().read(dfxp_style_region_align_conflict)
        rendered = WebVTTWriter().write(parsed)

        assert webvtt_from_dfxp_with_conflicting_align == rendered
Example #21
0
def fetch_subtitles(entry, lang='ru'):
    """Fetch an entry's automatic captions and convert them to WebVTT.

    The URL of the first caption track listed for ``lang`` under
    ``entry['automatic_captions']`` is downloaded, parsed with
    ``DFXPReader`` and rendered with ``WebVTTWriter``.

    Returns a ``(video_id, title, vtt)`` tuple, or ``None`` when
    ``entry['automatic_captions']`` is empty or missing a value.
    """
    captions_by_lang = entry['automatic_captions']
    if not captions_by_lang:
        return None

    title = entry['title']
    video_id = entry['id']
    caption_url = captions_by_lang[lang][0]['url']

    caption_text = requests.get(caption_url).content.decode()
    writer = WebVTTWriter()
    vtt = writer.write(DFXPReader().read(caption_text))

    return video_id, title, vtt
Example #22
0
    def test_dfxp_with_positioning_to_webvtt_conversion(
            self, sample_webvtt_from_dfxp_with_positioning_and_style,
            sample_dfxp_with_positioning):
        """Positioned DFXP written with explicit video size matches the sample."""
        parsed = DFXPReader().read(sample_dfxp_with_positioning)
        writer = WebVTTWriter(video_width=VIDEO_WIDTH,
                              video_height=VIDEO_HEIGHT)
        vtt_output = writer.write(parsed)

        assert isinstance(vtt_output, str)
        expected = sample_webvtt_from_dfxp_with_positioning_and_style
        self.assert_webvtt_equals(expected, vtt_output)
Example #23
0
def route_subtitles(course_id, lecture_id):
    """Return a lecture's Coursera subtitles as a ``text/vtt`` response.

    The subtitles are requested from class.coursera.org, read as SRT and
    converted to WebVTT. When the converter raises
    ``CaptionReadNoCaptions`` the response body is an empty string.
    """
    url_template = (
        'https://class.coursera.org/%s-001/lecture/subtitles?q=%d_en')
    reply = requests.get(url_template % (course_id, lecture_id))

    try:
        converter = CaptionConverter()
        converter.read(reply.text, SRTReader())
        vtt_body = converter.write(WebVTTWriter())
    except CaptionReadNoCaptions:
        vtt_body = ''

    return Response(vtt_body, content_type='text/vtt')
Example #24
0
    def convert_caps_to_vtt(caps):
        """
        Convert transcripts in any supported format into WebVTT.

        Supported input formats: DFXP/TTML - SAMI - SCC - SRT - WebVTT.

        Arguments:
            caps (unicode): Raw transcripts.
        Returns:
            unicode: Transcripts converted into WebVTT format, or an empty
            string when ``caps`` is empty or its format is not detected.
        """
        if not caps:
            return u''

        reader_cls = detect_format(caps)
        if not reader_cls:
            return u''

        return WebVTTWriter().write(reader_cls().read(caps))
Example #25
0
def subtitle(request, title, no):
    """Serve a subscene subtitle for ``title`` as a ``text/vtt`` response.

    Parenthesised parts of ``title`` are removed and the last remaining
    character is dropped before searching subscene for English subtitles.
    Subtitle number ``no`` of the result is downloaded as a zip archive.
    Its first member is decoded with the encoding BeautifulSoup detects,
    read as SRT and converted to WebVTT. The response body is that WebVTT
    text with non-ASCII characters dropped and HTML entities unescaped.

    :param request: the incoming request (not used by this function).
    :param title: film title, possibly containing a parenthesised suffix.
    :param no: index of the subtitle to use; converted with ``int``.
    """
    # Raw string: ``\(`` is a regex escape, not a string escape.
    t = re.sub(r'\(.*?\)', '', title)[:-1]
    film = subscene.search(t, "English")

    # Not named ``zip`` so the builtin is not shadowed.
    zip_response = requests.get(subscene.zipped_url(film.subtitles[int(no)]))

    fp = StringIO(zip_response.content)
    archive = zipfile.ZipFile(fp, 'r')
    srt = archive.read(archive.namelist()[0])
    # BeautifulSoup is used here only for its encoding detection.
    soup = BeautifulSoup(srt)
    converter = CaptionConverter()
    unistring = srt.decode(soup.originalEncoding)
    # Strip a leading byte-order mark only when one is really present.
    # Dropping the first character of every UTF-8 file removed the first
    # cue number from files that have no BOM.
    if unistring.startswith(u'\ufeff'):
        unistring = unistring[1:]
    converter.read(unistring, SRTReader())
    html_parser = HTMLParser.HTMLParser()

    vtt_ascii = converter.write(WebVTTWriter()).encode('ascii', 'ignore')
    return HttpResponse(html_parser.unescape(vtt_ascii),
                        content_type="text/vtt")
Example #26
0
def convert_subtitles_to_vtt(input_file: str, output_file: str):
    """Convert .srt subtitles to .vtt for web playback.

    The encoding of ``input_file`` is guessed with chardet and the file is
    then read as text using that encoding. The WebVTT result is written to
    ``output_file`` with the ``utf-8-sig`` encoding.

    Returns ``True`` on success. Returns ``False``, after logging the
    exception, when the SRT reader raises ``CaptionReadNoCaptions``; no
    output file is written in that case.
    """
    logger.info(f'Converting {input_file} to {output_file}')

    with open(input_file, mode='rb') as binary_source:
        source_encoding = chardet.detect(binary_source.read())['encoding']

    with open(input_file, mode='r', encoding=source_encoding) as text_source:
        srt_text = str(text_source.read())

    converter = CaptionConverter()
    try:
        converter.read(srt_text, SRTReader())
    except CaptionReadNoCaptions:
        logger.exception(f'Failed to convert {input_file} to {output_file}')
        return False  # Likely UTF-16 subtitles

    vtt_text = converter.write(WebVTTWriter())
    with open(output_file, mode='w', encoding='utf-8-sig') as vtt_target:
        vtt_target.write(vtt_text)

    return True
Example #27
0
def from_srt(input_f, output_f):
  """
    Takes an input SRT file or filename and writes out VTT contents to the given
    output file or filename.

    The input bytes are decoded with the encoding guessed by chardet, or
    with cp1252 when chardet's confidence is below 0.9. If decoding fails,
    cp1252 and then utf8 are tried. The UnicodeDecodeError of the last
    attempt is re-raised when every candidate fails.
  """
  default_subrip_encoding = 'cp1252' # standard for SubRip files

  with vtt_open(input_f, 'r') as f:
    orig = f.read()

  detect = chardet.detect(orig)
  encoding = detect['encoding']
  if detect['confidence'] < 0.9:
    encoding = default_subrip_encoding

  # Try the preferred encoding first, then the fallbacks in order.
  candidates = [encoding, default_subrip_encoding, 'utf8']
  for attempt, candidate in enumerate(candidates):
    try:
      print("ENCODING: " + candidate)
      contents = orig.decode(candidate)
      break
    except UnicodeDecodeError:
      if attempt == len(candidates) - 1:
        raise

  # caption converter seems to have a tough time with the BOM on
  # Python < 2.7.8, so ditch it if it exists. After decoding, the BOM is the
  # single character U+FEFF; comparing the text with the byte string
  # codecs.BOM_UTF8 never matched.
  if contents.startswith(u'\ufeff'):
    contents = contents[1:]

  converter = CaptionConverter()
  converter.read(contents, SRTReader())
  contents = converter.write(WebVTTWriter())

  with vtt_open(output_f, 'w') as o:
    # The final byte of the encoded output is dropped -- presumably a
    # trailing newline; TODO confirm against the writer's output.
    o.write(contents.encode('utf-8')[:-1])
Example #28
0
def run_pipeline(url=None,
                 hmm=None,
                 lm=None,
                 dict=None,
                 caption_format='webvtt',
                 out_file=None):
    """Transcribe the audio at ``url`` with pocketsphinx and emit captions.

    A GStreamer pipeline decodes ``url`` and feeds it to the pocketsphinx
    element. Every final hypothesis reported on the bus becomes one
    caption. The captions are stored under the ``en-US`` language and
    rendered once the pipeline reaches end-of-stream or reports an error.

    :param url: URI handed to ``uridecodebin``; required.
    :param hmm: value for the pocketsphinx ``hmm`` property, set if truthy.
    :param lm: value for the pocketsphinx ``lm`` property, set if truthy.
    :param dict: value for the pocketsphinx ``dict`` property, set if truthy.
    :param caption_format: ``'srt'`` selects ``SRTWriter``; any other value
        selects ``WebVTTWriter``.
    :param out_file: path to write the captions to as UTF-8; when ``None``
        the captions are printed instead.
    :raises Exception: when ``url`` is ``None``.
    """
    import sys  # local import: the file's import block is outside this view

    if url is None:
        raise Exception('No URL specified!')
    pipeline = Gst.parse_launch('uridecodebin name=source ! audioconvert !' +
                                ' audioresample ! pocketsphinx name=asr !' +
                                ' fakesink')
    source = pipeline.get_by_name('source')
    source.set_property('uri', url)
    pocketsphinx = pipeline.get_by_name('asr')
    for prop_name, prop_value in (('hmm', hmm), ('lm', lm), ('dict', dict)):
        if prop_value:
            pocketsphinx.set_property(prop_name, prop_value)

    bus = pipeline.get_bus()

    # Start playing
    pipeline.set_state(Gst.State.PLAYING)

    cap_set = CaptionSet()
    captions = []

    # Wait until error or EOS
    while True:
        try:
            msg = bus.timed_pop(Gst.CLOCK_TIME_NONE)
            if not msg:
                continue

            if msg.type == Gst.MessageType.EOS:
                break
            if msg.type == Gst.MessageType.ERROR:
                # Without this branch a failed pipeline never posts EOS and
                # the blocking pop above would wait forever.
                err, debug = msg.parse_error()
                sys.stderr.write(
                    'GStreamer error: {} ({})\n'.format(err, debug))
                break

            struct = msg.get_structure()
            if struct and struct.get_name() == 'pocketsphinx':
                if struct['final']:
                    c = Caption()
                    c.start = struct['start_time'] / Gst.USECOND
                    c.end = struct['end_time'] / Gst.USECOND
                    c.nodes.append(
                        CaptionNode.create_text(struct['hypothesis']))
                    captions.append(c)
        except KeyboardInterrupt:
            # Ask the pipeline to finish; the loop then ends on the EOS
            # message.
            pipeline.send_event(Gst.Event.new_eos())

    # Free resources
    pipeline.set_state(Gst.State.NULL)

    cap_set.set_captions('en-US', captions)
    writer = SRTWriter() if caption_format == 'srt' else WebVTTWriter()
    caption_data = writer.write(cap_set)
    if out_file is not None:
        # Close the handle deterministically so the data is flushed.
        with codecs.open(out_file, 'w', 'utf-8') as out:
            out.write(caption_data)
    else:
        print(caption_data)
 def test_empty_cues_are_deleted(self):
     # Reading and re-writing the empty-cue sample must give the stored
     # expected output.
     parsed = WebVTTReader().read(SAMPLE_WEBVTT_EMPTY_CUE)
     rendered = WebVTTWriter().write(parsed)
     self.assertEqual(SAMPLE_WEBVTT_FROM_EMPTY_CUE, rendered)
 def test_positioning_is_kept(self):
     # Reading and re-writing the positioned sample must return it
     # unchanged.
     expected = SAMPLE_WEBVTT_FROM_DFXP_WITH_POSITIONING
     parsed = WebVTTReader().read(expected)
     rendered = WebVTTWriter().write(parsed)
     self.assertEqual(expected, rendered)
    def test_cue_settings_are_kept(self):
        # Reading and re-writing the sample with cue settings must return
        # it unchanged.
        expected = SAMPLE_WEBVTT_WITH_CUE_SETTINGS
        parsed = WebVTTReader().read(expected)

        rendered = WebVTTWriter().write(parsed)

        self.assertEqual(expected, rendered)
Example #32
0
 def setUp(self):
     # Create the WebVTTWriter instance that is stored on ``self.writer``.
     self.writer = WebVTTWriter()