Example #1
0
    def input_file(self):
        """Return the parsed subtitle file, opening it on first access.

        The file named by ``self.arguments.file`` is read as raw bytes, its
        encoding is sniffed with ``detect`` and passed through
        ``self.normalize_encoding``, and the result is cached on the instance
        as ``_source_file`` so later calls reuse it.
        """
        if hasattr(self, '_source_file'):
            return self._source_file

        with open(self.arguments.file, 'rb') as raw:
            sniffed = detect(raw.read()).get('encoding')
            codec = self.normalize_encoding(sniffed)

        # Parse problems are routed through the library's ERROR_LOG handling.
        self._source_file = WebVTTFile.open(
            self.arguments.file,
            encoding=codec,
            error_handling=WebVTTFile.ERROR_LOG,
        )
        return self._source_file
Example #2
0
def get_webvttfile(file_obj):
    """Get a WebVTTFile instance from a file-like object.

    The object is rewound to its start and read in full. Byte content is
    decoded as UTF-8; content that is already text is passed through
    unchanged.

    Args:
        file_obj: a seekable, readable file-like object.

    Returns:
        The result of ``WebVTTFile.from_string`` on the text content.

    Raises:
        UnicodeDecodeError: if byte content is not valid UTF-8.
    """
    file_obj.seek(0)
    contents = file_obj.read()

    # Decode raw bytes to text. Testing for bytes (== str on Python 2)
    # instead of the Python-2-only ``unicode`` builtin keeps this working on
    # both major versions.
    if isinstance(contents, (bytes, bytearray)):
        contents = codecs.decode(contents, 'utf-8')

    return WebVTTFile.from_string(contents)
Example #3
0
    def input_file(self):
        """Lazily open and cache the subtitle file named on the command line.

        On the first call the file at ``self.arguments.file`` is read in
        binary mode so ``detect`` can guess its encoding; the normalized
        guess is then used to open it as a ``WebVTTFile``. The opened file is
        stored in ``self._source_file`` and returned directly afterwards.
        """
        already_loaded = hasattr(self, '_source_file')
        if not already_loaded:
            with open(self.arguments.file, 'rb') as handle:
                raw_bytes = handle.read()
                guess = detect(raw_bytes).get('encoding')
                encoding = self.normalize_encoding(guess)

            open_kwargs = {
                'encoding': encoding,
                'error_handling': WebVTTFile.ERROR_LOG,
            }
            self._source_file = WebVTTFile.open(
                self.arguments.file, **open_kwargs)
        return self._source_file
Example #4
0
def merge_subtitle(sub_a, sub_b, delta):
    """Merge two subtitle tracks into a single WebVTTFile.

    Every cue start/end of both tracks becomes a boundary on one sorted
    timeline. Each pair of neighbouring boundaries that is further apart
    than ``delta`` yields one output cue whose text is the joined text of
    whatever ``find_subtitle`` reports for each track in that span; spans
    with empty joined text are dropped.

    Args:
        sub_a: first track (iterable of cues with ``start``/``end``).
        sub_b: second track, same shape as ``sub_a``.
        delta: minimum span length; shorter-or-equal spans are skipped.

    Returns:
        A new WebVTTFile with re-numbered indexes.
    """
    merged = WebVTTFile()

    # Gather boundaries track by track: starts first, then ends.
    boundaries = []
    for track in (sub_a, sub_b):
        boundaries += [cue.start.ordinal for cue in track]
        boundaries += [cue.end.ordinal for cue in track]
    boundaries.sort()

    # Cursors handed back by find_subtitle and fed into its next call.
    cursor_a = cursor_b = 0
    for lower, upper in zip(boundaries, boundaries[1:]):
        start = WebVTTTime.from_ordinal(lower)
        end = WebVTTTime.from_ordinal(upper)

        if not ((end - start) > delta):
            continue

        text_a, cursor_a = find_subtitle(sub_a, start, end, cursor_a)
        text_b, cursor_b = find_subtitle(sub_b, start, end, cursor_b)

        joined = join_lines(text_a, text_b)
        if len(joined) > 0:
            merged.append(WebVTTItem(0, start, end, joined))

    merged.clean_indexes()
    return merged
Example #5
0
def merge_subtitle(sub_a, sub_b, delta):
    """Combine two subtitle tracks into one WebVTTFile.

    The start and end ordinals of all cues in both tracks are sorted into a
    single timeline. For each consecutive pair of points whose distance
    exceeds ``delta``, the texts that ``find_subtitle`` returns for both
    tracks are joined with ``join_lines`` and, when non-empty, emitted as a
    new cue covering that span.

    Args:
        sub_a: first subtitle track.
        sub_b: second subtitle track.
        delta: spans must be strictly longer than this to be emitted.

    Returns:
        The merged WebVTTFile, after ``clean_indexes()``.
    """
    out = WebVTTFile()

    starts_a = [item.start.ordinal for item in sub_a]
    ends_a = [item.end.ordinal for item in sub_a]
    starts_b = [item.start.ordinal for item in sub_b]
    ends_b = [item.end.ordinal for item in sub_b]
    timeline = sorted(starts_a + ends_a + starts_b + ends_b)

    # Positions returned by find_subtitle, passed back in on the next span.
    pos_a = 0
    pos_b = 0
    for idx in range(len(timeline) - 1):
        start = WebVTTTime.from_ordinal(timeline[idx])
        end = WebVTTTime.from_ordinal(timeline[idx + 1])

        if (end - start) > delta:
            text_a, pos_a = find_subtitle(sub_a, start, end, pos_a)
            text_b, pos_b = find_subtitle(sub_b, start, end, pos_b)

            text = join_lines(text_a, text_b)
            if len(text) == 0:
                continue
            out.append(WebVTTItem(0, start, end, text))

    out.clean_indexes()
    return out
def _ass_timestamp(vtt_time):
    """Format a cue time as the ``H:MM:SS.cc`` text used in ASS events.

    ``isoformat()`` leaves out the fractional part when it is zero, so
    ``.00`` is appended in that case; otherwise the last four digits of the
    fraction are cut off. The leading hour digit is dropped either way.
    Assumes ``to_time()`` returns a ``datetime.time`` -- TODO confirm.
    """
    iso_text = vtt_time.to_time().isoformat()
    if '.' in iso_text:
        return iso_text[1:-4]
    return iso_text[1:] + '.00'


def convert_subs(vtt_filename, font="", size=""):
    """Convert a WebVTT subtitle file into an ASS file written beside it.

    Args:
        vtt_filename: path of the ``.vtt`` file to read.
        font: font name used by every generated style; "Arial" when empty.
        size: font size used by every generated style; "24" when empty.

    Returns:
        The ``.ass`` path when the input could be parsed, otherwise
        ``vtt_filename`` unchanged. An invalid or unreadable input is only
        logged at debug level; no exception is raised for it.

    NOTE(review): when the parsed file is falsy (presumably: it has no cues)
    the ``.ass`` path is still returned although nothing is written --
    confirm callers cope with that.
    NOTE(review): the output is opened in binary mode and fed ``str``
    templates, which only works on Python 2 -- confirm the target runtime.
    """
    output_filename = vtt_filename
    try:
        subs = WebVTTFile.open(vtt_filename)
    except InvalidFile:
        my_log("Not a VTT file.", xbmc.LOGDEBUG)
        subs = None
    except IOError:
        my_log("File not found.", xbmc.LOGDEBUG)
        subs = None
    else:
        # Replace only a literal ".vtt" suffix. str.rstrip('.vtt') strips any
        # trailing '.', 'v' or 't' characters, mangling names like "matt.vtt".
        if vtt_filename.endswith('.vtt'):
            output_filename = vtt_filename[:-len('.vtt')] + ".ass"
        else:
            output_filename = vtt_filename + ".ass"

    #Internal rendering resolution used for scaling. Messing with this affects font sizes, etc.
    def_res = (720, 480)
    #Offset used for correcting the output.
    offset = (0, -45)
    #File header
    ass_header_temp = "[Script Info]\n" \
                      "; This is an Advanced Sub Station Alpha v4+ script.\n" \
                      "Title: converted from vtt\n" \
                      "ScriptType: v4.00+\n" \
                      "Collisions: Normal\n" \
                      "PlayDepth: 0\n" \
                      "PlayResX: {}\n" \
                      "PlayResY: {}\n\n" \
                      "[V4+ Styles]\n" \
                      "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, " \
                      "Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, " \
                      "Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n"

    ass_header = ass_header_temp.format(def_res[0], def_res[1])

    #Style line template
    line_template = "Style: {Name},{Font},{Fontsize},{PrimaryColour},{SecondaryColour},{OutlineColour},{BackColour}," \
                    "{Bold},{Italic},{Underline},{StrikeOut},{ScaleX},{ScaleY},{Spacing},{Angle},{BorderStyle}," \
                    "{Outline},{Shadow},{Alignment},{MarginL},{MarginR},{MarginV},{Encoding}\n"

    #Event header template
    event_header = "[Events]\n" \
                   "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"

    #Event line template
    event_template = "Dialogue: {Layer},{Start},{End},{Style},{Name},{MarginL},{MarginR},{MarginV},{Effect},{Text}\n"

    if not font:
        font = "Arial"
    if not size:
        size = "24"
    #Setup initial values for the styles
    initial_font_settings = {
        'Font': font,
        "Fontsize": size,
        'PrimaryColour': "&H00FFFFFF",  #NOTE: this is AABBGGRR hex notation
        'SecondaryColour': "&H0300FFFF",
        'OutlineColour': "&H00000000",
        'BackColour': "&H02000000",
        'Bold': "0",
        'Italic': "0",
        'Underline': "0",
        'StrikeOut': "0",
        'ScaleX': "100",
        'ScaleY': "100",
        'Spacing': "0",
        'Angle': "0",
        'BorderStyle': "1",
        'Outline': "2",
        'Shadow': "1",
        'Alignment': "2",
        'MarginL': "0",
        'MarginR': "0",
        'MarginV': "0",
        'Encoding': "1"
    }

    styles = dict()

    styles['dialogue'] = dict(initial_font_settings)
    styles['dialogue']['PrimaryColour'] = "&H0000FFFF"  # yellow
    styles['dialogue']['Name'] = 'dialogue'

    styles['song_lyrics'] = dict(initial_font_settings)
    # BB=FF, GG=FF, RR=00 in AABBGGRR notation, i.e. cyan
    styles['song_lyrics']['PrimaryColour'] = "&H00FFFF00"
    styles['song_lyrics']['Name'] = 'song_lyrics'

    #copy the initial values, but don't make changes. reserved for future use
    styles['captions'] = dict(initial_font_settings)

    if not subs:
        return output_filename

    # The context manager closes the file even if a write or a malformed
    # cue raises part-way through.
    with open(output_filename, 'wb') as ass_fh:
        #write out the header and the dialogue style
        ass_fh.write(ass_header)
        ass_fh.write(line_template.format(**styles['dialogue']))
        ass_fh.write(line_template.format(**styles['song_lyrics']))
        #find the 'special' sub blocks that specify an alignment
        for item in subs.data:
            if "align" in item.position or "Caption" in item.text or "caption" in item.text:
                #tweak the alignment in the styles (can't set alignment in events)
                # "1" is bottom left, "3" is bottom right (like numpad)
                if "align:left" in item.position:
                    alignment = "1"
                elif "align:right" in item.position:
                    alignment = "3"
                else:
                    alignment = "2"
                # One style per such cue, named after the cue index with
                # '-' replaced by '_'.
                styles['captions']['Name'] = item.index.replace('-', '_')
                styles['captions']['Alignment'] = alignment
                ass_fh.write(line_template.format(**styles['captions']))

        ass_fh.write("\n\n")
        ass_fh.write(event_header)
        #write out the subtitles: ASS calls these events, VTT has these stored in <c> tags
        for item in subs.data:
            # don't want the 'default' margin to have the subtitles at
            # the absolute edge of the screen
            abs_vpos = 10
            abs_hpos = 0
            for item_pos in item.position.split():
                #vtt uses percentages, ass uses pixels. convert
                if 'line' in item_pos:
                    # vtt's 'line' is percentage from top of screen (usually)
                    fraction = float(item_pos.split(':')[1].rstrip('%')) / 100
                    abs_vpos = int(def_res[1] - fraction * def_res[1] + offset[1])
                if 'position' in item_pos:
                    # while 'position' is percentage from left of screen (usually)
                    fraction = float(item_pos.split(':')[1].rstrip('%')) / 100
                    abs_hpos = int(fraction * def_res[0] + offset[0])
            item_text = item.text_without_tags.encode('utf-8')

            #match the event's style to the styles written above
            if "caption" in item.text or "Caption" in item.text:
                style_name = item.index.replace('-', '_')
            elif "song" in item.text or "Song" in item.text:
                style_name = "song_lyrics"
            else:
                style_name = "dialogue"

            event = {
                'Layer': "0",
                'Start': _ass_timestamp(item.start),
                'End': _ass_timestamp(item.end),
                'Style': style_name,
                'Name': item.index,
                'MarginL': abs_hpos,
                'MarginR': "0",
                'MarginV': abs_vpos,
                'Effect': "",
                'Text': item_text
            }
            ass_fh.write(event_template.format(**event))
    return output_filename
def main(options):
    """Download a video, transcribe its audio with DeepSpeech, write WebVTT.

    Reads ``options.vid_url`` (the video to fetch), ``options.output`` (path
    of the VTT file to write) and ``options.deepspeech_model_dir`` (directory
    containing the model files). The audio track is extracted with ffmpeg as
    16 kHz mono 16-bit PCM and transcribed in segments of
    ``AUDIO_SEGMENT_SAMPLES`` samples, one cue per non-empty segment.

    Exits the process with status 1 when ffmpeg is unavailable or fails, or
    when the extracted audio is not sampled at 16000 Hz.
    """
    # Local import: only this function needs to delete the temporary WAV.
    import os

    # Ensure ffmpeg is around
    if not run_ffmpeg(['-version']):
        log.error(
            "ffmpeg needs to be available to strip audio from the video file.")
        exit(1)

    with NamedTemporaryFile(delete=True) as vid_file:
        log.info("Downloading %s - this might take a while." % options.vid_url)
        response = get(options.vid_url, stream=True)
        total_length = response.headers.get("content-length")
        if total_length is None:  # no content length header
            log.info("Unknown length - can't predict how long this will take.")
            # Write to the temp file (the original referenced an undefined
            # name ``f`` here).
            vid_file.write(response.content)
        else:
            bar = ProgressBar(max_value=int(total_length))
            dl = 0
            for data in response.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                dl += len(data)
                vid_file.write(data)
                bar.update(dl)
        # ffmpeg opens the file by name, so buffered bytes must be on disk
        # before it runs -- in both download branches.
        vid_file.flush()

        log.info("Download done. Stripping audio.")
        (wav_file, wav_file_name) = mkstemp('.wav')
        try:
            result = run_ffmpeg([
                "-y", "-i", vid_file.name, "-vn", "-acodec", "pcm_s16le",
                "-ar", "16000", "-ac", "1", wav_file_name
            ])
            if not result:
                log.error("ffmpeg failed. Bailing.")
                exit(1)

            fs, audio = wav.read(wav_file_name)
        finally:
            # mkstemp leaves both the descriptor and the file to the caller;
            # release them on every path, including exit(1) above.
            # NOTE(review): assumes wav.read loads the samples into memory
            # (no memory-mapping) so the file can go away -- confirm.
            close(wav_file)
            os.remove(wav_file_name)

    log.info("Will write VTT to %s" % options.output)
    # Make sure the WAV is to code...
    log.info("Loading up WAV file...")

    if fs != 16000:
        log.error("Only 16000hz WAV files are usable.")
        exit(1)

    total_samples = len(audio)
    duration_hours, duration_minutes, duration_seconds = sample_index_to_time(
        total_samples)
    log.info("Approximate duration: %d:%02d:%02d" %
             (duration_hours, duration_minutes, duration_seconds))

    # Let's load up DeepSpeech and get it ready.
    log.info("Loading pre-trained DeepSpeech model...")
    root_model_dir = path.join(options.deepspeech_model_dir, MODEL_DIR)

    model = path.join(root_model_dir, MODEL_FILE)
    alphabet = path.join(root_model_dir, MODEL_ALPHABET)
    lang_model = path.join(root_model_dir, MODEL_LANG_MODEL)
    trie = path.join(root_model_dir, MODEL_TRIE)

    deepspeech = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    log.info("Done loading model.")

    log.info("Loading language model...")
    deepspeech.enableDecoderWithLM(alphabet, lang_model, trie, LM_WEIGHT,
                                   WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
    log.info("Done loading model.")

    playhead = 0

    out = WebVTTFile()

    bar = ProgressBar(max_value=total_samples)
    while playhead < (total_samples - 1):
        end_point = min(playhead + AUDIO_SEGMENT_SAMPLES, (total_samples - 1))
        segment = audio[playhead:end_point]
        inference = deepspeech.stt(segment, fs)
        log.debug("Inferred: %s" % inference)

        start_hours, start_minutes, start_seconds = sample_index_to_time(
            playhead)
        playhead = end_point
        end_hours, end_minutes, end_seconds = sample_index_to_time(playhead)
        # Report progress for every segment, including the skipped ones.
        bar.update(playhead)

        if not inference or inference == "ah":
            continue

        # .items() works on both Python 2 and 3 dictionaries.
        for search, replace in INFERENCE_REPLACEMENTS.items():
            inference = sub(r"\b" + search + r"\b", replace, inference)

        inference = fill(inference, width=MAX_CAPTION_WIDTH)

        start = WebVTTTime(start_hours, start_minutes, start_seconds)
        end = WebVTTTime(end_hours, end_minutes, end_seconds)

        item = WebVTTItem(0, start, end, inference)
        out.append(item)

        # Save after every cue so a partial transcript exists on disk.
        out.save(options.output, encoding="utf-8")

    out.clean_indexes()
    out.save(options.output, encoding="utf-8")
Example #8
0
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)

    #Settings default values
    delta = WebVTTTime(milliseconds=500)
    encoding = "utf_8"
    #-

    if len(args) <> 3:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ("-d", "--delta"):
            delta = WebVTTTime(milliseconds=int(a))
        elif o in ("-e", "--encoding"):
            encoding = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit()

    subs_a = WebVTTFile.open(args[0], encoding=encoding)
    subs_b = WebVTTFile.open(args[1], encoding=encoding)
    out = merge_subtitle(subs_a, subs_b, delta)
    out.save(args[2], encoding=encoding)


# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Example #9
0
 def test_shift(self):
     # A file holding one default-constructed cue.
     shifted = WebVTTFile([WebVTTItem()])

     # Offset shift: the cue end is expected to equal (1, 1, 1, 1).
     shifted.shift(1, 1, 1, 1)
     end_after_offset = shifted[0].end
     self.assertEqual(end_after_offset, (1, 1, 1, 1))

     # Ratio shift: every component of the end is expected to double.
     shifted.shift(ratio=2)
     end_after_ratio = shifted[0].end
     self.assertEqual(end_after_ratio, (2, 2, 2, 2))
Example #10
0
 def test_default_value(self):
     # The fixture file is expected to report the platform line separator.
     fixture_eol = self.file.eol
     self.assertEqual(fixture_eol, os.linesep)

     # An eol passed to the constructor is expected to be kept unchanged.
     crlf = '\r\n'
     explicit = WebVTTFile(eol=crlf)
     self.assertEqual(explicit.eol, crlf)
Example #11
0
 def setUp(self):
     # Fixture: a WebVTTFile constructed with no arguments, stored on the
     # instance as ``file`` for the test methods to use.
     self.file = WebVTTFile()
Example #12
0
 def setUp(self):
     # Fixture: a WebVTTFile constructed with no arguments, stored on the
     # instance as ``duck`` for the test methods to use.
     self.duck = WebVTTFile()
Example #13
0
 def test_multiple_item(self):
     # The file's ``text`` is expected to be the cue texts joined by a
     # newline, in the order the cues were given.
     vtt_file = WebVTTFile([
         WebVTTItem(1, {'seconds': 0}, {'seconds': 3}, 'Hello'),
         WebVTTItem(1, {'seconds': 1}, {'seconds': 2}, 'World !')
     ])
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(vtt_file.text, 'Hello\nWorld !')
Example #14
0
 def test_single_item(self):
     # With a single cue, the file's ``text`` is expected to be exactly that
     # cue's text.
     vtt_file = WebVTTFile(
         [WebVTTItem(1, {'seconds': 1}, {'seconds': 2}, 'Hello')])
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed in Python 3.12.
     self.assertEqual(vtt_file.text, 'Hello')
Example #15
0
 def test_shift(self):
     # Start from a file containing one default-constructed cue.
     cues = WebVTTFile([WebVTTItem()])
     expected_after_offset = (1, 1, 1, 1)
     expected_after_ratio = (2, 2, 2, 2)

     # First shift by an offset of 1 in each positional component.
     cues.shift(1, 1, 1, 1)
     self.assertEqual(cues[0].end, expected_after_offset)

     # Then scale by a ratio of 2.
     cues.shift(ratio=2)
     self.assertEqual(cues[0].end, expected_after_ratio)
Example #16
0
        opts, args = getopt.getopt(sys.argv[1:], 'hd:e:', ["help", "encoding=", "delta="])
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)

    #Settings default values
    delta = WebVTTTime(milliseconds=500)
    encoding="utf_8"
    #-

    if len(args) <> 3:
        usage()
        sys.exit(2)

    for o, a in opts:
        if o in ("-d", "--delta"):
            delta = WebVTTTime(milliseconds=int(a))
        elif o in ("-e", "--encoding"):
            encoding = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit()

    subs_a = WebVTTFile.open(args[0], encoding=encoding)
    subs_b = WebVTTFile.open(args[1], encoding=encoding)
    out = merge_subtitle(subs_a, subs_b, delta)
    out.save(args[2], encoding=encoding)

# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()