Ejemplo n.º 1
0
def create_overview_vtt(video_id, nb_img, image, duration, overviewfilename):
    """Write the WebVTT index of a thumbnail sprite and return a status message.

    One cue is produced per thumbnail. When ``nb_img`` is 99, cue ``i`` spans
    ``i``% to ``i + 1``% of ``duration``; otherwise cue ``i`` spans second ``i``
    to second ``i + 1``. Each cue text is the sprite URL followed by an
    ``#xywh=`` fragment selecting the i-th tile on a single horizontal row.

    If the saved file fails ``check_file``, the error is logged, the encoding
    step is set to -1 and an e-mail is sent; the error text is then returned
    instead of the success message.
    """

    def _timestamp(seconds):
        # "HH:MM:SS.mmm": the clock part comes from the integer seconds, the
        # fraction is copied verbatim from the 3-decimal rendering.
        pieces = format(float(seconds), '.3f').split('.')
        clock = time.strftime('%H:%M:%S', time.gmtime(int(pieces[0])))
        return clock + ".%s" % pieces[1]

    report = "\ncreate overview vtt file"
    tile_width = image["image_width"]
    tile_height = image["image_height"]
    sprite_url = image["image_url"]

    track = WebVTT()
    for index in range(nb_img):
        if nb_img == 99:
            begin = duration * index / 100
            finish = duration * (index + 1) / 100
        else:
            begin = index
            finish = index + 1
        region = '%s#xywh=%d,%d,%d,%d' % (
            sprite_url, tile_width * index, 0, tile_width, tile_height)
        track.captions.append(
            Caption(_timestamp(begin), _timestamp(finish), region))
    track.save(overviewfilename)

    if not check_file(overviewfilename):
        report = "overviewfilename Wrong file or path : "\
            + "\n%s" % overviewfilename
        add_encoding_log(video_id, report)
        change_encoding_step(video_id, -1, report)
        send_email(report, video_id)
        return report

    report += "\n- overviewfilename :\n%s" % overviewfilename
    return report
Ejemplo n.º 2
0
def download_and_convert_subtitles(path, lang_and_url, c):
    """Download every subtitle track into ``path`` and make sure it is WebVTT.

    Args:
        path: directory in which ``<lang>.vtt`` files are stored.
        lang_and_url: mapping of language code to subtitle URL.
        c: object exposing ``get_page(url)`` that returns the subtitle body.

    Returns:
        dict mapping each language whose file is available (either already
        present or fetched successfully) to its file name ``<lang>.vtt``.

    Failures are logged and the language is skipped; they never propagate.
    """
    real_subtitles = {}
    for lang in lang_and_url:
        filename = lang + ".vtt"
        path_lang = os.path.join(path, filename)
        if os.path.exists(path_lang):
            # Already downloaded on a previous run.
            real_subtitles[lang] = filename
            continue

        url = lang_and_url[lang]
        try:
            subtitle = c.get_page(url)
            # Renumber a leading cue "0" to "1" before the SRT conversion.
            subtitle = re.sub(r'^0$', '1', str(subtitle), flags=re.M)
            subtitle = html.unescape(subtitle)
            # Explicit UTF-8: unescaped entities may not be representable in
            # the platform default encoding.
            with open(path_lang, 'w', encoding='utf-8') as f:
                f.write(subtitle)
            if not is_webvtt(path_lang):
                WebVTT().from_srt(path_lang).save()
            real_subtitles[lang] = filename
        except HTTPError as e:
            # Every HTTP failure is reported, not only 403/404.
            logging.error("Fail to get subtitle from {} (HTTP {})".format(
                url, e.code))
        except Exception as e:
            logging.error("Error when converting subtitle {} : {}".format(
                url, e))
    return real_subtitles
Ejemplo n.º 3
0
def transcribe():
    """Transcribe the media file given as ``sys.argv[1]`` into WebVTT.

    ffmpeg decodes the input to mono, signed 16-bit little-endian raw audio at
    ``sample_rate`` Hz on its stdout. The audio is fed in 4000-byte chunks to
    the module-level ``rec`` object (presumably a Vosk-style recognizer --
    confirm against the module setup). Each result that contains a ``result``
    word list becomes one cue spanning its first word's start to its last
    word's end.

    The captions are saved to ``sys.argv[2]`` when given, otherwise the WebVTT
    content is printed.
    """
    command = [
        'ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ]

    results = []
    # The context manager closes the pipe and waits for ffmpeg, so the child
    # process is always reaped, even if the recognizer raises.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        while True:
            data = process.stdout.read(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                results.append(rec.Result())
    results.append(rec.FinalResult())

    vtt = WebVTT()
    for res in results:
        words = json.loads(res).get('result')
        if not words:
            # Nothing recognised in this segment.
            continue

        start = timeString(words[0]['start'])
        end = timeString(words[-1]['end'])
        content = ' '.join([w['word'] for w in words])

        vtt.captions.append(Caption(start, end, textwrap.fill(content)))

    # save or return webvtt
    if len(sys.argv) > 2:
        vtt.save(sys.argv[2])
    else:
        print(vtt.content)
def file_writing(path):
    """Annotate every cue of ``static/subtitle.vtt`` with its predicted emotion.

    Each cue text is passed to ``predict``; the cue is rewritten as
    ``<c.COLOR> EMOTION: TEXT</c>`` and the result is saved to
    ``static/my_captions.vtt``. ``joy`` is green, ``fear``/``anger``/
    ``sadness`` are red, ``neutral`` and any other label are blue.

    ``path`` is accepted for interface compatibility but is not used.
    """
    # Labels are compared by value: the previous ``is`` comparisons tested
    # object identity, which is not a reliable way to compare strings.
    colour_by_emotion = {
        "joy": "green",
        "fear": "red",
        "anger": "red",
        "sadness": "red",
        "neutral": "blue",
    }

    vtt = WebVTT()
    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        colour = colour_by_emotion.get(emotion, "blue")
        vtt.captions.append(
            Caption(
                line.start, line.end,
                "<c." + colour + "> " + emotion + ": " + line.text + "</c>"))
    vtt.save('static/my_captions.vtt')
Ejemplo n.º 5
0
def download_and_convert_subtitles(output_path, subtitles,
                                   instance_connection):
    """Fetch subtitle tracks into ``output_path`` and convert them to WebVTT.

    Args:
        output_path: directory that receives the ``<lang>.vtt`` files.
        subtitles: mapping of language code to subtitle URL.
        instance_connection: object exposing ``get_page(url)``.

    Returns:
        dict mapping each language whose file is available (already present
        or fetched successfully) to its file name ``<lang>.vtt``.

    An empty response or any exception is logged and the language skipped.
    """
    processed_subtitles = {}
    for lang in subtitles:
        filename = f"{lang}.vtt"
        subtitle_file = pathlib.Path(output_path).joinpath(filename)
        if subtitle_file.exists():
            # Already fetched on a previous run.
            processed_subtitles[lang] = filename
            continue
        try:
            raw_subtitle = instance_connection.get_page(subtitles[lang])
            if not raw_subtitle:
                logger.error(
                    f"Subtitle fetch failed from {subtitles[lang]}")
                continue
            # Renumber a leading cue "0" to "1" before the SRT conversion.
            subtitle = html.unescape(
                re.sub(r"^0$", "1", str(raw_subtitle), flags=re.M))
            # Explicit UTF-8 so the write does not depend on the platform
            # default encoding.
            with open(subtitle_file, "w", encoding="utf-8") as sub_file:
                sub_file.write(subtitle)
            if not is_webvtt(subtitle_file):
                WebVTT().from_srt(subtitle_file).save()
            processed_subtitles[lang] = filename
        except Exception as exc:
            logger.error(
                f"Error while converting subtitle {subtitles[lang]} : {exc}"
            )
    return processed_subtitles
Ejemplo n.º 6
0
class SRTCaptionsTestCase(unittest.TestCase):
    """Round-trip test between SRT and WebVTT files inside ``OUTPUT_DIR``."""

    def setUp(self):
        # Fresh converter objects and a fresh output directory for each test.
        self.webvtt = WebVTT()
        self.srtcaptions = SRTCaptions()

        os.makedirs(OUTPUT_DIR)

    def tearDown(self):
        # Remove everything the test wrote.
        if os.path.exists(OUTPUT_DIR):
            rmtree(OUTPUT_DIR)

    def _get_file(self, filename):
        """Return the path of a fixture stored in ``SUBTITLES_DIR``."""
        return os.path.join(SUBTITLES_DIR, filename)

    def test_convert_from_srt_to_vtt_and_back_gives_same_file(self):
        def in_output(name):
            return os.path.join(OUTPUT_DIR, name)

        def slurp(location):
            with open(location, 'r', encoding='utf-8') as handle:
                return handle.read()

        copy(self._get_file('sample.srt'), OUTPUT_DIR)

        # SRT -> VTT; save() without arguments is expected to leave
        # sample.vtt next to the source, which is read back just below.
        self.webvtt.from_srt(in_output('sample.srt'))
        self.webvtt.save()

        # VTT -> SRT
        self.srtcaptions.from_vtt(in_output('sample.vtt'))
        self.srtcaptions.save(in_output('sample_converted.srt'))

        original = slurp(in_output('sample.srt'))
        converted = slurp(in_output('sample_converted.srt'))

        self.assertEqual(original.strip(), converted.strip())
Ejemplo n.º 7
0
 def test_srt_empty_gets_removed(self):
     """No caption parsed with ``ignore_empty_captions`` may be empty.

     Every caption read from ``empty_caption_text.srt`` must have at least
     one line, and none of its lines may be ``""`` or ``None``.
     """
     parser = WebVTT(parse_options={'ignore_empty_captions': True})
     parsed = parser.from_srt(self._get_file('empty_caption_text.srt'))
     for cue in parsed.captions:
         self.assertNotEqual(len(cue.lines), 0)
         for text_line in cue.lines:
             self.assertNotEqual(text_line, "")
             self.assertIsNotNone(text_line)
Ejemplo n.º 8
0
def _format_vtt_timestamp(seconds):
    """Render ``seconds`` as ``str(timedelta)`` with a forced fractional part."""
    text = str(datetime.timedelta(seconds=seconds))
    # str(timedelta) omits the fraction for whole seconds.
    return text if '.' in text else f'{text}.000000'


def generate_vtt_file(all_preds, logits, save_path):
    """Write ``<save_path>/demo.vtt`` with one cue per predicted segment.

    Segments come from ``get_labels_start_end_time(all_preds, [1])``. The gap
    between consecutive segments is split in half and given to both
    neighbours. Frame indices are converted to seconds as ``(frame + 8) / 25``.

    When ``logits`` is given, the cue text is the comma-separated list of the
    ten classes with the highest summed score over the segment (best first),
    looked up in ``data/info/bslcp/info.pkl``; otherwise the cue text is empty
    and the pickle is not read.
    """
    vtt = WebVTT()
    labels, starts, ends = get_labels_start_end_time(all_preds, [1])

    # Close the gap between neighbouring segments: each side takes half.
    for ix in range(len(labels) - 1):
        half_gap = floor((starts[ix + 1] - ends[ix]) / 2)
        starts[ix + 1] -= half_gap
        ends[ix] += half_gap

    if logits is not None:
        # Convert once instead of once per segment.
        i3d_scores = np.asarray(logits)
        # NOTE(security): pickle.load can execute arbitrary code; only use a
        # trusted info.pkl.
        with open('data/info/bslcp/info.pkl', 'rb') as f:
            vocabulary = pickle.load(f)['words']

    for start, end in zip(starts, ends):
        if logits is not None:
            segment_score = np.sum(i3d_scores[start:end], axis=0)
            # Top-10 classes, then ordered by decreasing score.
            top = np.argpartition(segment_score, -10)[-10:]
            top = top[np.argsort(-segment_score[top])]
            class_str = ','.join(vocabulary[ix] for ix in top)
        else:
            class_str = ''

        caption = Caption(
            _format_vtt_timestamp((start + 8) / 25),
            _format_vtt_timestamp((end + 8) / 25),
            [class_str]
        )
        vtt.captions.append(caption)

    vtt.save(f'{save_path}/demo.vtt')
Ejemplo n.º 9
0
    def __init__(self, suffix='', clear=True, vtt_reader=None):
        """Store the conversion settings and the WebVTT reader to use.

        ``suffix`` and ``clear`` are kept as given. The input extension is
        fixed to ``.vtt`` and the output extension to ``.srt``. When no
        ``vtt_reader`` is supplied a new ``WebVTT`` instance is created.
        """
        self.__in_ext = '.vtt'
        self.__out_ext = '.srt'

        self.__suffix = suffix
        self.__clear = clear

        self.__vtt_reader = WebVTT() if vtt_reader is None else vtt_reader
Ejemplo n.º 10
0
class SubtitleWrapper:
    """Small helper that accumulates captions in one ``WebVTT`` document."""

    def __init__(self):
        self.vtt = WebVTT()

    def write_caption(self, start, end, line):
        """Append a cue built from ``start``, ``end`` and ``line``."""
        self.vtt.captions.append(Caption(start, end, line))

    def save_caption(self, path):
        """Save the document to ``path`` with ``.vtt`` appended."""
        target = f'{path}.vtt'
        self.vtt.save(target)

    def read_caption(self, vtt_file):
        """Return the result of ``WebVTT.read`` for ``vtt_file``."""
        return WebVTT.read(vtt_file)
Ejemplo n.º 11
0
def enrichment_to_vtt(list_enrichment, video):
    """Serialise the enrichments of ``video`` to a WebVTT file and attach it.

    Each enrichment becomes one cue whose identifier is the enrichment slug
    and whose text is a small JSON object (``title``, ``type``,
    ``stop_video``, ``url``) spread over several lines. The file is stored in
    a ``CustomFileModel`` and linked from the video's ``EnrichmentVtt``.

    Returns:
        The filesystem path of the stored ``enrichment.vtt`` file.
    """
    # Function-scope import: only needed to escape the JSON string values.
    import json

    def _timecode(seconds):
        # HH:MM:SS.mmm ([:-3] trims microseconds down to milliseconds).
        return datetime.datetime.utcfromtimestamp(seconds).strftime(
            "%H:%M:%S.%f")[:-3]

    def _quoted(value):
        # JSON string literal, quotes included. Plain values render exactly
        # as before; quotes, backslashes and control characters are escaped
        # so the payload stays valid JSON.
        return json.dumps(str(value), ensure_ascii=False)

    webvtt = WebVTT()
    for enrich in list_enrichment:
        start = _timecode(enrich.start)
        end = _timecode(enrich.end)
        url = enrichment_to_vtt_type(enrich)
        caption = Caption(
            start,
            end,
            [
                "{",
                '"title": {0},'.format(_quoted(enrich.title)),
                '"type": {0},'.format(_quoted(enrich.type)),
                '"stop_video": "{0}",'.format(1 if enrich.stop_video else 0),
                '"url": {0}'.format(_quoted(url)),
                "}",
            ],
        )
        caption.identifier = enrich.slug
        webvtt.captions.append(caption)
    temp_vtt_file = NamedTemporaryFile(suffix=".vtt")
    with open(temp_vtt_file.name, "w") as f:
        webvtt.write(f)
    if FILEPICKER:
        videodir, created = UserFolder.objects.get_or_create(name="%s" %
                                                             video.slug,
                                                             owner=video.owner)
        previousEnrichmentFile = CustomFileModel.objects.filter(
            name__startswith="enrichment",
            folder=videodir,
            created_by=video.owner,
        )
        for enr in previousEnrichmentFile:
            enr.delete()  # do it like this to delete file
        enrichmentFile, created = CustomFileModel.objects.get_or_create(
            name="enrichment", folder=videodir, created_by=video.owner)

        if enrichmentFile.file and os.path.isfile(enrichmentFile.file.path):
            os.remove(enrichmentFile.file.path)
    else:
        enrichmentFile, created = CustomFileModel.objects.get_or_create()
    enrichmentFile.file.save("enrichment.vtt", File(temp_vtt_file))
    enrichmentVtt, created = EnrichmentVtt.objects.get_or_create(video=video)
    enrichmentVtt.src = enrichmentFile
    enrichmentVtt.save()
    return enrichmentFile.file.path
Ejemplo n.º 12
0
    def get(self, request, *args, **kwargs):
        """Return the feed items of a stream as a WebVTT metadata track.

        Query parameters: ``stream`` (stream uuid), ``start``, ``end`` and
        ``epoch`` (datetimes). Items that start at or after ``start`` minus
        5 s and end before ``end`` plus 5 s are emitted, ordered by start
        time, as cues whose text is a JSON description of the item. When the
        stream has a ``program_date_time``, the window is re-based on it and
        it replaces ``epoch``.

        Responds with 400 when a parameter is missing or is not a valid
        datetime, and with 404 when the stream does not belong to the feed.
        """
        feed = self.get_object()

        try:
            stream = get_object_or_404(feed.streams.all(), uuid=request.GET["stream"])
        except KeyError:
            return HttpResponseBadRequest(_("Bad request"))

        try:
            start = parse_datetime(request.GET["start"])
            end = parse_datetime(request.GET["end"])
            epoch = parse_datetime(request.GET["epoch"])
        except (KeyError, ValueError):
            # ValueError: well-formed but impossible date.
            return HttpResponseBadRequest(_("Bad request"))
        # parse_datetime returns None for badly formatted input; without this
        # check the arithmetic below would raise and produce a 500.
        if start is None or end is None:
            return HttpResponseBadRequest(_("Bad request"))

        if stream.program_date_time:
            start_diff = start - stream.started_at
            end_diff = end - stream.started_at
            start = stream.program_date_time + start_diff
            end = stream.program_date_time + end_diff
            epoch = stream.program_date_time

        # Only validated here: a malformed epoch is irrelevant when the
        # stream's program_date_time overrides it.
        if epoch is None:
            return HttpResponseBadRequest(_("Bad request"))

        start = start - timedelta(seconds=5)
        end = end + timedelta(seconds=5)

        webvtt = WebVTT()
        resp = HttpResponse(content_type="text/vtt; charset=utf-8")

        items = feed.items.filter(starts_at__gte=start, ends_at__lt=end).order_by(
            "starts_at"
        )
        for item in items:
            start_timecode = self.get_vtt_timecode(epoch, item.starts_at)
            end_timecode = self.get_vtt_timecode(epoch, item.ends_at)
            data = {
                "uuid": item.uuid,
                "starts_at": item.starts_at.isoformat(),
                "ends_at": item.ends_at.isoformat(),
                "start_timecode": start_timecode,
                "end_timecode": end_timecode,
                "payload": item.payload,
            }
            cap = Caption(
                start_timecode, end_timecode, [json.dumps(data, cls=DjangoJSONEncoder)]
            )
            webvtt.captions.append(cap)

        webvtt.write(resp)
        return resp
Ejemplo n.º 13
0
def readVtt(input_file, output_file, input_language, output_language):
    """Translate every cue of a WebVTT file and swap it in for the original.

    Each cue text is printed, translated with ``translate`` and printed again.
    When the module-level ``both_language`` flag is truthy the translation is
    appended in parentheses, otherwise it replaces the text. After saving,
    ``input_file`` is renamed to ``input_file + ".old"`` and the file named
    like ``input_file`` with ``.srt`` replaced by ``.vtt`` is renamed to
    ``input_file``. ``output_file`` is not used.
    """
    document = WebVTT().read(input_file)
    for cue in document:
        print(cue.text)
        translated = translate(cue.text, input_language, output_language)
        cue.text = (cue.text + " (" + translated + ")"
                    if both_language else translated)
        print(cue.text)
    document.save()

    backup_name = input_file + ".old"
    converted_name = input_file.replace(".srt", ".vtt")
    os.rename(input_file, backup_name)
    os.rename(converted_name, input_file)
    print(">", input_file, "saved!")
Ejemplo n.º 14
0
    def __init__(self, title, id, filename, alias=None, tags=None):
        """Build a video record and load its cleaned-up captions.

        Stores the title (plus a lower-cased copy), the lower-cased alias (or
        None), the tags (or an empty list) and the id, then parses
        ``filename`` as WebVTT into ``self.captions`` and calls
        ``self.write()``.
        """
        self.title = title
        self.title_lower = title.lower()
        self.alias = alias.lower() if alias else None
        self.tags = tags or []
        self.id = id
        self.captions = []

        for cue in WebVTT().read(filename):
            start_s, start_ms = timecode_to_sms(cue.start)
            end_s, end_ms = timecode_to_sms(cue.end)
            has_previous = bool(self.captions)

            # Cues shown for less than 50 ms are noise; the very first cue is
            # always kept.
            if has_previous and \
                    (end_s * 1000 + end_ms) - (start_s * 1000 + start_ms) < 50:
                continue

            text = cue.text.strip()

            # A cue often repeats the previous cue's text as its prefix; keep
            # only the new part.
            if has_previous:
                previous = self.captions[-1].text
                if text.startswith(previous):
                    text = text[len(previous):].strip()

            self.captions.append(Video.Cap(start_s, text))

        self.write()
Ejemplo n.º 15
0
def write_to_subtitles(data: Union[List[dict], List[List[dict]]]):
    """Build a WebVTT document from tier data.

    Args:
        data (Union[List[dict], List[List[dict]]]):
            either a 'word'-type tier, i.e. a list of dicts with the keys
            'start', 'end' and 'text', or a 'sentence'-type tier, i.e. a
            list of such lists. A sentence becomes one cue running from its
            first word's start to its last word's end, with the words joined
            by single spaces.

    Returns:
        WebVTT: the subtitles, one cue per entry of ``data``.
    """
    vtt = WebVTT()
    for entry in data:
        if isinstance(entry, list):
            begin = float_to_timedelta(entry[0]["start"])
            finish = float_to_timedelta(entry[-1]["end"])
            text = " ".join(word["text"] for word in entry)
        else:
            begin = float_to_timedelta(entry["start"])
            finish = float_to_timedelta(entry["end"])
            text = entry["text"]
        vtt.captions.append(Caption(begin, finish, text))
    return vtt
Ejemplo n.º 16
0
def convertVTTtoTXT(filename):
    """Dump the cue texts of ``<filename>.en.vtt`` into ``<filename>.txt``.

    The texts are written back to back, with no separator added between
    cues. A confirmation line is printed when done.
    """
    # The context manager closes the output even if parsing the VTT fails.
    with open(filename + '.txt', 'w') as txt_file:
        for caption in WebVTT().read(filename + '.en.vtt'):
            txt_file.write(caption.text)
    print('subtitle generated for ' + filename)
Ejemplo n.º 17
0
 def __write_srt(fd_srt, path):
     """Write the cues of the WebVTT file ``path`` to ``fd_srt`` as SRT items.

     Items are numbered from 1, the cue text is HTML-unescaped and each item
     is followed by a newline.
     """
     for position, cue in enumerate(WebVTT().read(path), start=1):
         begin = srttime.SubRipTime(0, 0, cue.start_in_seconds)
         finish = srttime.SubRipTime(0, 0, cue.end_in_seconds)
         entry = srtitem.SubRipItem(position, begin, finish,
                                    html.unescape(cue.text))
         fd_srt.write("%s\n" % str(entry))
Ejemplo n.º 18
0
def download_and_convert_subtitles(path, transcripts_data, already_in_vtt,
                                   headers):
    """Download each transcript to ``<path>/<lang>.vtt``, converting from SRT.

    Args:
        path: directory receiving the subtitle files.
        transcripts_data: mapping of language code to transcript URL.
        already_in_vtt: when falsy, the download is treated as SRT and
            converted with ``WebVTT.from_srt``.
        headers: passed through to ``get_page``.

    HTTP errors are ignored and the language is skipped.
    """
    # Function-scope import: replaces the former external ``sed`` call.
    import re

    for lang in transcripts_data:
        path_lang = os.path.join(path, lang + ".vtt")
        try:
            subtitle = get_page(transcripts_data[lang],
                                headers).decode('utf-8')
            if not already_in_vtt:
                # Renumber a cue "0" to "1" before the SRT conversion, which
                # the original comment says checks for a first line of 1.
                # Done in Python so no shell command is built from a path.
                subtitle = re.sub(r'^0$', '1', subtitle, flags=re.M)
            # The body was decoded as UTF-8; write it back the same way.
            with open(path_lang, 'w', encoding='utf-8') as f:
                f.write(subtitle)
            if not already_in_vtt:
                webvtt = WebVTT().from_srt(path_lang)
                webvtt.save()
        except HTTPError:
            # Best effort, as before: the original named 404/403 only but
            # did not re-raise other codes, so every HTTP error is skipped.
            continue
Ejemplo n.º 19
0
def transcode(source):
    """Convert the SRT file ``source`` to a sibling ``.vtt`` file.

    The source is first normalised in place by ``convert_ending`` and
    ``clean_file``.

    Returns:
        The path of the written ``.vtt`` file, or ``None`` on failure. On
        failure the exception is logged and ``source``, if it still exists,
        is renamed to ``source + '.failed'``.
    """
    try:
        source_dir = os.path.dirname(source)
        stem = os.path.splitext(os.path.basename(source))[0]
        convert_ending(source)
        clean_file(source)
        # os.path.join keeps a bare file name relative; plain concatenation
        # with '/' would have pointed at the filesystem root.
        targetFull = os.path.join(source_dir, stem + '.vtt')
        logging.debug(prelog + 'targetFull: ' + targetFull)
        webvtt = WebVTT().from_srt(source)
        webvtt.save(targetFull)
        return targetFull
    except Exception:
        # Not a bare except: KeyboardInterrupt and SystemExit must propagate.
        logging.exception(prelog)
        if os.path.isfile(source):
            shutil.move(source, source + '.failed')
    return None
Ejemplo n.º 20
0
def combine(vtt_filename):
    """Flatten a WebVTT file into one string, skipping already-seen lines.

    Every cue text has double spaces collapsed once and is split on newlines.
    A line is appended (preceded by a space) only if it is not already a
    substring of the accumulated text. Double spaces are collapsed once more
    in the result.
    """
    merged = ""
    for cue in WebVTT().read(vtt_filename):
        for fragment in cue.text.replace("  ", " ").split('\n'):
            if fragment not in merged:
                merged = merged + " " + fragment
    return merged.replace("  ", " ")
Ejemplo n.º 21
0
def store_remote_transcripting_video(video_id):
    """Turn the remote ``transcript.json`` of a video into stored captions.

    The words of the first transcript (the highest-confidence one, per the
    original comment) are grouped into cues: a cue is closed as soon as
    adding a word would push its accumulated duration past
    ``SENTENCE_MAX_LENGTH``. The resulting WebVTT is handed to ``saveVTT``,
    the encoding log and step are updated, and a completion e-mail is sent
    when ``EMAIL_ON_TRANSCRIPTING_COMPLETION`` is set.

    If ``transcript.json`` is missing, the error is logged, the encoding step
    is set to -1 and an e-mail is sent.
    """
    msg = ""
    video_to_encode = Video.objects.get(id=video_id)
    output_dir = create_outputdir(video_id, video_to_encode.video.path)
    transcript_path = output_dir + "/transcript.json"

    if not check_file(transcript_path):
        msg += "Wrong file or path : "\
            + "\n%s" % video_to_encode.video.path
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, -1, msg)
        send_email(msg, video_id)
        return

    with open(transcript_path) as json_file:
        info_video = json.load(json_file)

    print_if_debug(output_dir)
    print_if_debug(json.dumps(info_video, indent=2))

    webvtt = WebVTT()
    # They're sorted by confidence. First one is highest confidence result.
    words = info_video["transcripts"][0]["words"]

    text_caption = []
    start_caption = None
    duration = 0
    for word in words:
        text_caption.append(word['word'])
        if start_caption is None:
            start_caption = word['start_time']
        if duration + word['duration'] > SENTENCE_MAX_LENGTH:
            caption = Caption(
                format_time_caption(start_caption),
                format_time_caption(start_caption + duration +
                                    word['duration']),
                " ".join(text_caption))
            webvtt.captions.append(caption)
            text_caption = []
            start_caption = None
            duration = 0
        else:
            duration += word['duration']

    # Flush the words still pending after the last full cue; previously the
    # tail of the transcript was silently dropped.
    if text_caption:
        webvtt.captions.append(
            Caption(
                format_time_caption(start_caption),
                format_time_caption(start_caption + duration),
                " ".join(text_caption)))

    print_if_debug(webvtt)
    msg += saveVTT(video_to_encode, webvtt)
    add_encoding_log(video_id, msg)
    change_encoding_step(video_id, 0, "done")
    # send the end-of-transcription e-mail
    if EMAIL_ON_TRANSCRIPTING_COMPLETION:
        send_email_transcript(video_to_encode)
Ejemplo n.º 22
0
def readSrt(input_file, output_file, input_language, output_language):
    """Translate an SRT file and replace it with the translated WebVTT.

    Each cue is printed, translated with ``translate`` and printed again.
    When the module-level ``both_language`` flag is truthy the translation is
    appended in parentheses, otherwise it replaces the text. The translated
    SRT is saved to ``output_file`` and converted to WebVTT. Then
    ``input_file`` is renamed to ``input_file + ".old"``, ``output_file`` is
    removed and the converted ``.vtt`` is renamed to ``input_file``.
    """
    print('processing file', input_file)
    subs = SubRipFile.open(input_file)
    print(">", "read file", input_file)

    for cue in subs:
        print(cue.text)
        translated = translate(cue.text, input_language, output_language)
        cue.text = (cue.text + " (" + translated + ")"
                    if both_language else translated)
        print(cue.text)

    subs.save(output_file, 'utf-8')
    WebVTT().from_srt(output_file).save()

    converted_name = output_file.replace(".srt", ".vtt")
    os.rename(input_file, input_file + ".old")
    os.remove(output_file)
    os.rename(converted_name, input_file)
    print(">", output_file, "saved!")
Ejemplo n.º 23
0
    def load_auto_subtitle_data(self, vid):
        """Load word-level timings from the auto-generated subtitle of ``vid``.

        Looks for exactly one file matching
        ``<SUBTITLE_PATH>/*<vid>-<LANG>-auto.vtt``. For every raw cue line
        that matches ``self.TIMESTAMP_PATTERN``, the line is split into words
        on ``<c>`` and each word gets ``word``, ``start`` and ``end`` keys:
        the first word starts at the cue start, the last one ends at the cue
        end, and the timestamps between words give the other boundaries.
        The word dicts are appended to ``self.subtitle``.

        ``self.subtitle`` is set to None when no file (or more than one) is
        found. NOTE(review): ``self.subtitle`` must already be a list before
        this is called -- confirm in the constructor.
        """
        lang = my_config.LANG
        postfix_in_filename = '-' + lang + '-auto.vtt'
        file_list = glob.glob(my_config.SUBTITLE_PATH + '/*' + vid + postfix_in_filename)
        if len(file_list) > 1:
            print('more than one subtitle. check this.', file_list)
            self.subtitle = None
            assert False
        if len(file_list) == 1:
            for subtitle_chunk in WebVTT().read(file_list[0]):
                # Always split: the old ``if text.find('\n')`` test was falsy
                # for a leading newline, leaving a str that was then iterated
                # character by character.
                raw_lines = str(subtitle_chunk.raw_text).split('\n')

                for raw_line in raw_lines:
                    if self.TIMESTAMP_PATTERN.search(raw_line) is None:
                        continue

                    # removes html tags and timing tags from caption text
                    raw_line = raw_line.replace("</c>", "")
                    raw_line = re.sub(r"<c[.]\w+>", '', raw_line)

                    word_list = []
                    chunk_start = subtitle_chunk.start_in_seconds
                    chunk_end = subtitle_chunk.end_in_seconds

                    word_chunk = raw_line.split('<c>')

                    # NOTE(review): a matching line is assumed to contain at
                    # least one ``<c>``; with a single piece the lookup of
                    # the previous word below fails -- confirm with real data.
                    for word_idx, word in enumerate(word_chunk):
                        word_info = {}

                        if word_idx == len(word_chunk) - 1:
                            # Last word: no trailing timestamp, ends with the cue.
                            word_info['word'] = word
                            word_info['start'] = word_list[word_idx - 1]['end']
                            word_info['end'] = chunk_end
                            word_list.append(word_info)
                            break

                        # "text<HH:MM:SS.mmm>" -> text and its end timestamp
                        # ([:-1] drops the closing '>').
                        word = word.split("<")
                        word_info['word'] = word[0]
                        word_info['end'] = self.get_seconds(word[1][:-1])

                        if word_idx == 0:
                            word_info['start'] = chunk_start
                        else:
                            word_info['start'] = word_list[word_idx - 1]['end']
                        word_list.append(word_info)

                    self.subtitle.extend(word_list)
        else:
            print('subtitle file is not exist')
            self.subtitle = None
Ejemplo n.º 24
0
    def translate(self):
        """Translate ``<fileNameWOType>.vtt`` into the target language.

        Every cue text is sent to ``Translate.AWSTranslate.translate_text``
        with the instance's source and target language codes. Timings and
        identifiers are copied unchanged. The result is saved as
        ``<fileNameWOType>_<targetLanguage>.vtt``.

        Returns:
            Always 1.
        """
        source_name = self.fileNameWOType + '.vtt'
        translated_doc = WebVTT()

        for cue in webvtt.read(source_name):
            response = Translate.AWSTranslate.translate_text(
                Text=cue.text,
                SourceLanguageCode=self.sourceLanguage,
                TargetLanguageCode=self.targetLanguage)

            translated_cue = Caption(cue.start, cue.end,
                                     response.get('TranslatedText'))
            translated_cue.identifier = cue.identifier
            translated_doc.captions.append(translated_cue)

        target_name = self.fileNameWOType + '_' + self.targetLanguage + '.vtt'
        translated_doc.save(target_name)
        return 1
Ejemplo n.º 25
0
def _timestamp_to_ms(timestamp):
    """Convert an ``HH:MM:SS.mmm`` timestamp string to integer milliseconds."""
    # The parts are strings and must be converted before the arithmetic;
    # multiplying a str by an int repeats it instead.
    h, m, s, ms = (int(part) for part in re.split(r'[\.:]+', timestamp))
    return ((h * 60 + m) * 60 + s) * 1000 + ms


def vtt2json(vttfile):
    """Read a WebVTT file into a dict of parallel lists.

    Returns:
        dict with the keys ``start`` and ``end`` (cue boundaries in integer
        milliseconds) and ``text`` (cue texts), all in file order.
    """
    t_start_milli = []
    t_end_milli = []
    text = []
    for caption in WebVTT().read(vttfile):
        t_start_milli.append(_timestamp_to_ms(caption.start))
        t_end_milli.append(_timestamp_to_ms(caption.end))
        text.append(caption.text)
    return {"start": t_start_milli, "end": t_end_milli, "text": text}
Ejemplo n.º 26
0
def read_file(file_name):
    """Read an SRT file and return its segments.

    Every caption is passed to ``read_caption``; results that are ``None``
    are left out.
    """
    parsed = WebVTT.from_srt(file_name)
    candidates = (read_caption(cue) for cue in parsed.captions)
    return [segment for segment in candidates if segment is not None]
Ejemplo n.º 27
0
def convert(directory, filename):
    """Convert ``<directory>/<filename>.vtt`` to ``<directory>/<filename>.srt``.

    Items are numbered from 1 and the cue text is HTML-unescaped.
    """
    # Function-scope import: builds the paths with the platform separator
    # instead of a hard-coded backslash.
    import os

    vtt_filepath = os.path.join(directory, filename + ".vtt")
    srt_filepath = os.path.join(directory, filename + ".srt")

    # The context manager flushes and closes the output, which was previously
    # left open.
    with open(srt_filepath, "w") as srt:
        for index, caption in enumerate(WebVTT().read(vtt_filepath), start=1):
            start = SubRipTime(0, 0, caption.start_in_seconds)
            end = SubRipTime(0, 0, caption.end_in_seconds)
            item = SubRipItem(index, start, end, html.unescape(caption.text))
            srt.write(str(item) + "\n")
Ejemplo n.º 28
0
def process_webvtt(webvtt_file, video_id, output_file):
    """Write each cue as a link to its position in the video.

    For every cue, ``output_file`` receives
    ``[TEXT](<youtube_watch_url><video_id>#t=HHhMMmSSs)`` on one line, the
    start time without its fractional part on the next, then a blank line.
    """
    # The context manager closes the output even if reading the VTT fails.
    with open(output_file, 'w') as text_file:
        for caption in WebVTT().read(webvtt_file):
            start = caption.start
            # Fixed-width slices of "HH:MM:SS.mmm".
            link = (caption_utility.youtube_watch_url + video_id + '#t=' +
                    start[0:2] + 'h' + start[3:5] + 'm' + start[6:8] + 's')
            text_file.write('[' + caption.text + ']' + '(' + link + ')' + '\n')
            text_file.write(start.split('.')[0] + '\n' + '\n')
Ejemplo n.º 29
0
 def get_subtitle(self):
     """Return the sentences of ``<vid_path>/<vid_name>.vtt`` with timings.

     If the file cannot be parsed, ``self.do_check`` is called on it and the
     read is attempted once more.

     Returns:
         A list with one dict per cue holding ``sent`` (the cue text with
         ``-``, ``"`` and ``.`` removed and its lines joined by spaces),
         ``start`` and ``end`` (seconds); or ``None`` when no subtitle file
         exists.
     """
     sub_list = glob.glob('{}/{}.vtt'.format(self.vid_path, self.vid_name))
     if len(sub_list) > 1:
         print('[WARN] There are more than one subtitle.')
         assert False
     if len(sub_list) == 1:
         # check wrong subtitle and rewrite vtt files
         try:
             sub_inst = WebVTT().read(sub_list[0])
         except Exception:
             # Not a bare except: KeyboardInterrupt and SystemExit propagate.
             self.do_check(sub_list[0])
             sub_inst = WebVTT().read(sub_list[0])

         subtitle = []
         for sub_chunk in sub_inst:
             # Always split: the old ``if text.find('\n')`` test was falsy
             # for a leading newline and skipped the split in that case.
             words = []
             for words_chunk in str(sub_chunk.raw_text).split('\n'):
                 words.extend(re.sub(r"[-\".]", '', words_chunk).split(' '))
             subtitle.append({
                 'sent': ' '.join(words).strip(' '),
                 'start': sub_chunk.start_in_seconds,
                 'end': sub_chunk.end_in_seconds,
             })
         return subtitle
     else:
         print('[ERROR] There is no subtitle file for {} video.'.format(
             self.vid_name))
         return None
Ejemplo n.º 30
0
def read_from_existing_vtt(bucket_name, file_name):
    """Rebuild a WebVTT object from a VTT blob kept in storage.

    The blob is decoded as UTF-8, its first two lines are skipped and empty
    lines are dropped. A line containing ``-->`` sets the current start and
    end; every other line becomes its own cue with the current timing.
    """
    payload = read_data_from_storage(bucket_name, file_name)
    body_lines = [
        entry for entry in payload.decode("utf-8").split('\n')[2:] if entry
    ]

    vtt = WebVTT()
    start = end = ''
    for entry in body_lines:
        if '-->' not in entry:
            vtt.captions.append(Caption(start, end, entry))
            continue
        start, end = entry.split(' --> ')

    return vtt