Example #1
0
def download_and_convert_subtitles(path, lang_and_url, c):
    """Download the subtitles of every language and store them as WebVTT.

    For each language in ``lang_and_url`` the subtitle is fetched with
    ``c.get_page`` unless ``<path>/<lang>.vtt`` already exists.  Content that
    ``is_webvtt`` does not recognise is converted in place with
    ``WebVTT().from_srt``.  Failures are logged and the language is skipped;
    they never abort the remaining downloads.

    Args:
        path: Directory in which the ``<lang>.vtt`` files are stored.
        lang_and_url: Mapping of language code to subtitle URL.
        c: Connection object exposing ``get_page(url)``.

    Returns:
        dict: language code -> file name (``<lang>.vtt``) for every subtitle
        that is usable on disk after the call.
    """
    real_subtitles = {}
    for lang in lang_and_url:
        filename = lang + ".vtt"
        path_lang = os.path.join(path, filename)
        if os.path.exists(path_lang):
            # Already downloaded by an earlier call.
            real_subtitles[lang] = filename
            continue

        url = lang_and_url[lang]
        try:
            subtitle = c.get_page(url)
            # A line consisting only of "0" is rewritten to "1" (presumably a
            # zero-based SRT index that from_srt would reject -- confirm).
            subtitle = re.sub(r'^0$', '1', str(subtitle), flags=re.M)
            subtitle = html.unescape(subtitle)
            # Explicit UTF-8 so the result does not depend on the locale.
            with open(path_lang, 'w', encoding='utf-8') as f:
                f.write(subtitle)
            if not is_webvtt(path_lang):
                webvtt = WebVTT().from_srt(path_lang)
                webvtt.save()
            real_subtitles[lang] = filename
        except HTTPError as e:
            if e.code in (403, 404):
                logging.error("Fail to get subtitle from {}".format(url))
            else:
                # Other HTTP failures used to be dropped without any trace.
                logging.error(
                    "HTTP error {} while getting subtitle from {}".format(
                        e.code, url))
        except Exception as e:
            logging.error("Error when converting subtitle {} : {}".format(
                url, e))
            # Do not leave a half-written or unconvertible file behind: the
            # next call would find it and report it as a valid subtitle.
            try:
                if os.path.exists(path_lang):
                    os.remove(path_lang)
            except OSError as cleanup_error:
                logging.error("Could not remove {} : {}".format(
                    path_lang, cleanup_error))
    return real_subtitles
Example #2
0
class SRTCaptionsTestCase(unittest.TestCase):
    """Round-trip test for the SRT <-> WebVTT converters."""

    def setUp(self):
        # Fresh converter objects and a fresh output directory for each test.
        self.webvtt = WebVTT()
        self.srtcaptions = SRTCaptions()

        os.makedirs(OUTPUT_DIR)

    def _get_file(self, filename):
        """Return the path of the fixture ``filename`` in ``SUBTITLES_DIR``."""
        fixture_path = os.path.join(SUBTITLES_DIR, filename)
        return fixture_path

    def tearDown(self):
        # Remove whatever the test wrote.
        if not os.path.exists(OUTPUT_DIR):
            return
        rmtree(OUTPUT_DIR)

    def test_convert_from_srt_to_vtt_and_back_gives_same_file(self):
        """SRT -> VTT -> SRT yields the original text (modulo outer blanks)."""
        source_srt = os.path.join(OUTPUT_DIR, 'sample.srt')
        intermediate_vtt = os.path.join(OUTPUT_DIR, 'sample.vtt')
        roundtrip_srt = os.path.join(OUTPUT_DIR, 'sample_converted.srt')

        copy(self._get_file('sample.srt'), OUTPUT_DIR)

        # SRT -> VTT; save() without arguments writes next to the source.
        self.webvtt.from_srt(source_srt)
        self.webvtt.save()

        # VTT -> SRT under a new name.
        self.srtcaptions.from_vtt(intermediate_vtt)
        self.srtcaptions.save(roundtrip_srt)

        contents = []
        for name in (source_srt, roundtrip_srt):
            with open(name, 'r', encoding='utf-8') as f:
                contents.append(f.read().strip())
        original, converted = contents

        self.assertEqual(original, converted)
def file_writing(path):
    """Annotate the captions of ``static/subtitle.vtt`` with their emotion.

    Every caption text is passed to ``predict``.  The caption is rewritten as
    ``<c.COLOUR> EMOTION: TEXT</c>`` where the colour is green for "joy", red
    for "fear", "anger" and "sadness", and blue for "neutral" or any other
    label.  The result is saved to ``static/my_captions.vtt``.

    Args:
        path: Not used; the input and output locations are hard-coded.  Kept
            so existing callers keep working.
    """
    # Labels are matched by value.  Comparing strings with `is` tests object
    # identity, which is not guaranteed for equal strings, so a value lookup
    # is required for the colours to be applied reliably.
    colours = {
        "joy": "green",
        "fear": "red",
        "anger": "red",
        "sadness": "red",
        "neutral": "blue",
    }

    vtt = WebVTT()
    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        colour = colours.get(emotion, "blue")  # unknown labels are blue too
        text = "<c." + colour + "> " + emotion + ": " + line.text + "</c>"
        vtt.captions.append(Caption(line.start, line.end, text))
    vtt.save('static/my_captions.vtt')
Example #4
0
def create_overview_vtt(video_id, nb_img, image, duration, overviewfilename):
    """Write the WebVTT file that maps time ranges onto overview thumbnails.

    One cue is produced per thumbnail.  Its payload is
    ``<image_url>#xywh=x,0,width,height`` where ``x`` advances by one image
    width per thumbnail.  With exactly 99 thumbnails, cue ``i`` covers
    ``duration * i / 100`` to ``duration * (i + 1) / 100`` seconds; otherwise
    it covers second ``i`` to second ``i + 1``.

    Args:
        video_id: Identifier passed to the logging/notification helpers.
        nb_img: Number of thumbnails (and cues).
        image: Mapping with ``image_width``, ``image_height``, ``image_url``.
        duration: Video duration used for the 99-thumbnail layout.
        overviewfilename: Destination of the WebVTT file.

    Returns:
        str: A log message.  On failure (``check_file`` is falsy) the message
        is also sent to ``add_encoding_log``, ``change_encoding_step`` and
        ``send_email``.
    """

    def _timestamp(seconds):
        # Render with three decimals, format the whole seconds as HH:MM:SS
        # and re-attach the fractional digits unchanged.
        whole, fraction = format(float(seconds), '.3f').split('.')
        clock = time.strftime('%H:%M:%S', time.gmtime(int(whole)))
        return clock + ".%s" % fraction

    msg = "\ncreate overview vtt file"
    image_width = image["image_width"]
    image_height = image["image_height"]
    image_url = image["image_url"]

    webvtt = WebVTT()
    for index in range(0, nb_img):
        if nb_img == 99:
            bounds = (duration * index / 100, duration * (index + 1) / 100)
        else:
            bounds = (index, index + 1)
        start_time, end_time = (_timestamp(bound) for bound in bounds)

        region = '%s#xywh=%d,%d,%d,%d' % (
            image_url, image_width * index, 0, image_width, image_height)
        webvtt.captions.append(
            Caption('%s' % start_time, '%s' % end_time, region))
    webvtt.save(overviewfilename)

    if not check_file(overviewfilename):
        # The file is missing or unusable: record and report the failure.
        msg = "overviewfilename Wrong file or path : "\
            + "\n%s" % overviewfilename
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, -1, msg)
        send_email(msg, video_id)
        return msg

    msg += "\n- overviewfilename :\n%s" % overviewfilename
    return msg
Example #5
0
def download_and_convert_subtitles(output_path, subtitles,
                                   instance_connection):
    """Download the subtitles of every language and store them as WebVTT.

    A language whose ``<output_path>/<lang>.vtt`` already exists is reused.
    Otherwise the subtitle is fetched with ``instance_connection.get_page``,
    written to disk and, when ``is_webvtt`` rejects it, converted in place
    with ``WebVTT().from_srt``.  Failures are logged and the language is
    skipped.

    Args:
        output_path: Directory in which the ``<lang>.vtt`` files are stored.
        subtitles: Mapping of language code to subtitle URL.
        instance_connection: Object exposing ``get_page(url)``; a falsy
            result is treated as a failed download.

    Returns:
        dict: language code -> file name (``<lang>.vtt``) for every subtitle
        that is usable on disk after the call.
    """
    processed_subtitles = {}
    for lang in subtitles:
        file_name = f"{lang}.vtt"
        subtitle_file = pathlib.Path(output_path).joinpath(file_name)
        if subtitle_file.exists():
            processed_subtitles[lang] = file_name
            continue

        try:
            raw_subtitle = instance_connection.get_page(subtitles[lang])
            if not raw_subtitle:
                logger.error(
                    f"Subtitle fetch failed from {subtitles[lang]}")
                continue
            # A line consisting only of "0" is rewritten to "1" before the
            # HTML entities are decoded.
            subtitle = html.unescape(
                re.sub(r"^0$", "1", str(raw_subtitle), flags=re.M))
            # Explicit UTF-8 so the result does not depend on the locale.
            with open(subtitle_file, "w", encoding="utf-8") as sub_file:
                sub_file.write(subtitle)
            if not is_webvtt(subtitle_file):
                webvtt = WebVTT().from_srt(subtitle_file)
                webvtt.save()
            processed_subtitles[lang] = file_name
        except Exception as exc:
            logger.error(
                f"Error while converting subtitle {subtitles[lang]} : {exc}"
            )
            # Do not leave a half-written or unconvertible file behind: the
            # next call would find it and report it as a valid subtitle.
            try:
                if subtitle_file.exists():
                    subtitle_file.unlink()
            except OSError as cleanup_error:
                logger.error(
                    f"Could not remove {subtitle_file} : {cleanup_error}")
    return processed_subtitles
Example #6
0
def transcribe():
    """Transcribe the media file ``sys.argv[1]`` into WebVTT captions.

    ffmpeg decodes the input to single-channel ``s16le`` audio at
    ``sample_rate`` on its standard output.  The stream is fed in 4000-byte
    chunks to the recogniser ``rec``.  Every recogniser result that carries a
    ``result`` word list becomes one caption spanning from the start of its
    first word to the end of its last word.

    The captions are saved to ``sys.argv[2]`` when that argument is given,
    otherwise the WebVTT content is printed.
    """
    command = [
        'ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i', sys.argv[1], '-ar',
        str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ]

    results = []
    # The context manager closes the pipe and waits for ffmpeg to exit, so
    # neither the file descriptor nor the child process is leaked.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        # read() returns b'' at end of stream, which ends the iteration.
        for data in iter(lambda: process.stdout.read(4000), b''):
            if rec.AcceptWaveform(data):
                results.append(rec.Result())
    results.append(rec.FinalResult())

    vtt = WebVTT()
    for res in results:
        words = json.loads(res).get('result')
        if not words:
            # Nothing was recognised in this segment.
            continue

        start = timeString(words[0]['start'])
        end = timeString(words[-1]['end'])
        content = ' '.join(w['word'] for w in words)

        vtt.captions.append(Caption(start, end, textwrap.fill(content)))

    # save or return webvtt
    if len(sys.argv) > 2:
        vtt.save(sys.argv[2])
    else:
        print(vtt.content)
Example #7
0
def generate_vtt_file(all_preds, logits, save_path):
    """Write the predicted segments to ``<save_path>/demo.vtt``.

    Segments are obtained from ``get_labels_start_end_time(all_preds, [1])``.
    The gap between two consecutive segments is closed by moving both
    boundaries towards its midpoint.  When ``logits`` is given, each caption
    lists the ten highest-scoring class words of the segment (scores summed
    over the segment, best first, comma separated); otherwise the caption
    text is empty.

    Args:
        all_preds: Predictions passed to ``get_labels_start_end_time``.
        logits: Per-position class scores, or ``None`` to omit class names.
        save_path: Directory that receives ``demo.vtt``.
    """

    def _timestamp(seconds):
        # str(timedelta) drops the fractional part when it is zero; pad it so
        # every timestamp carries a fraction.
        text = str(datetime.timedelta(seconds=seconds))
        return text if '.' in text else f'{text}.000000'

    vtt = WebVTT()
    labels, starts, ends = get_labels_start_end_time(all_preds, [1])

    # smaller boundaries: split the gap between neighbouring segments
    for ix in range(len(labels) - 1):
        half_gap = floor((starts[ix + 1] - ends[ix]) / 2)
        starts[ix + 1] -= half_gap
        ends[ix] += half_gap

    # The class-name table and the score array are only needed (and only
    # loaded) when scores were supplied.
    if logits is not None:
        i3d_scores = np.asarray(logits)  # converted once, not per segment
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # acceptable only because this is a project-local data file.
        with open('data/info/bslcp/info.pkl', 'rb') as f:
            info_data = pickle.load(f)

    for start, end in zip(starts, ends):
        if logits is not None:
            i3d_score = np.sum(i3d_scores[start:end], axis=0)
            # Ten largest scores, then ordered from best to worst.
            ind = np.argpartition(i3d_score, -10)[-10:]
            ind = ind[np.argsort(-i3d_score[ind])]
            class_str = ','.join(info_data['words'][i] for i in ind)
        else:
            class_str = ''

        # presumably frame index -> seconds at 25 fps with an 8-frame
        # offset -- TODO confirm
        caption = Caption(
            _timestamp((start + 8) / 25),
            _timestamp((end + 8) / 25),
            [class_str]
        )
        vtt.captions.append(caption)

    # save to a different file
    vtt.save(f'{save_path}/demo.vtt')
Example #8
0
class SubtitleWrapper:
    """Small convenience wrapper around a ``WebVTT`` document."""

    def __init__(self):
        # The document that write_caption() fills and save_caption() writes.
        self.vtt = WebVTT()

    def write_caption(self, start, end, line):
        """Append a caption from ``start`` to ``end`` with text ``line``."""
        self.vtt.captions.append(Caption(start, end, line))

    def save_caption(self, path):
        """Save the document to ``path`` with ``.vtt`` appended."""
        target = '{}.vtt'.format(path)
        self.vtt.save(target)

    def read_caption(self, vtt_file):
        """Return whatever ``WebVTT.read`` produces for ``vtt_file``."""
        parsed = WebVTT.read(vtt_file)
        return parsed
def readVtt(input_file, output_file, input_language, output_language):
    """Translate every caption of the WebVTT file ``input_file``.

    Each caption text is passed to ``translate``.  When the module-level flag
    ``both_language`` is truthy the translation is appended in parentheses,
    otherwise it replaces the text.  The document is then saved, the input
    file is renamed to ``<input_file>.old`` and the file named like the input
    with ``.srt`` replaced by ``.vtt`` is renamed to ``input_file``.

    Args:
        input_file: Path of the subtitle file to translate.
        output_file: Not used by this function.
        input_language: Source language passed to ``translate``.
        output_language: Target language passed to ``translate``.
    """
    document = WebVTT().read(input_file)
    for cue in document:
        print(cue.text)
        translated = translate(cue.text, input_language, output_language)
        cue.text = (cue.text + " (" + translated + ")"
                    if both_language else translated)
        print(cue.text)
    document.save()

    # NOTE(review): this sequence only works if the saved file and
    # input_file have different names -- confirm the expected extension.
    backup_name = input_file + ".old"
    os.rename(input_file, backup_name)
    converted_name = input_file.replace(".srt", ".vtt")
    os.rename(converted_name, input_file)
    print(">", input_file, "saved!")
Example #10
0
def download_and_convert_subtitles(path, transcripts_data, already_in_vtt,
                                   headers):
    """Download every transcript to ``<path>/<lang>.vtt``.

    Each transcript is fetched with ``get_page`` and decoded as UTF-8.
    Unless ``already_in_vtt`` is truthy the content is treated as SRT and
    converted in place with ``WebVTT().from_srt``.

    Args:
        path: Directory in which the ``<lang>.vtt`` files are written.
        transcripts_data: Mapping of language code to transcript URL.
        already_in_vtt: Truthy when the downloaded content is already WebVTT.
        headers: Passed through to ``get_page``.

    Returns:
        None.  Languages whose download raises ``HTTPError`` are skipped.
    """
    import re  # local import keeps the block self-contained

    for lang in transcripts_data:
        path_lang = os.path.join(path, lang + ".vtt")
        try:
            subtitle = get_page(transcripts_data[lang],
                                headers).decode('utf-8')
            if not already_in_vtt:
                # This little hack is used because WebVTT.from_srt checks
                # that the first line is 1.  It is done in Python rather than
                # by running sed through a shell on an unquoted path, which
                # broke on (and could be abused through) special characters.
                subtitle = re.sub(r'^0$', '1', subtitle, flags=re.M)
            # The content was decoded as UTF-8, so write it back as UTF-8.
            with open(path_lang, 'w', encoding='utf-8') as f:
                f.write(subtitle)
            if not already_in_vtt:
                webvtt = WebVTT().from_srt(path_lang)
                webvtt.save()
        except HTTPError:
            # Best effort: a transcript that cannot be fetched is skipped.
            # Every HTTP status is ignored here, not only 403 and 404.
            continue
Example #11
0
def transcode(source):
    """Convert the SRT file ``source`` into a WebVTT file beside it.

    The source is first passed to ``convert_ending`` and ``clean_file``, then
    converted with ``WebVTT().from_srt`` and saved under the same base name
    with a ``.vtt`` extension.

    Args:
        source: Path of the SRT file.

    Returns:
        The path of the written ``.vtt`` file, or ``None`` when anything
        failed.  On failure the error is logged and ``source`` (if it still
        exists) is renamed to ``<source>.failed``.
    """
    try:
        source_dir = os.path.dirname(source)
        stem = os.path.splitext(os.path.basename(source))[0]
        convert_ending(source)
        clean_file(source)
        # os.path.join keeps a bare file name relative; plain concatenation
        # with '/' would have produced "/name.vtt" for it.
        target_full = os.path.join(source_dir, stem + '.vtt')
        logging.debug(prelog + 'targetFull: ' + target_full)
        webvtt = WebVTT().from_srt(source)
        webvtt.save(target_full)
        return target_full
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        logging.exception(prelog)
        if os.path.isfile(source):
            shutil.move(source, source + '.failed')
    return None
Example #12
0
def readSrt(input_file, output_file, input_language, output_language):
    """Translate the SRT file ``input_file`` and replace it by a WebVTT file.

    Each subtitle text is passed to ``translate``.  When the module-level
    flag ``both_language`` is truthy the translation is appended in
    parentheses, otherwise it replaces the text.  The translated subtitles
    are saved to ``output_file``, converted with ``WebVTT().from_srt`` and
    saved again.  Finally the input is renamed to ``<input_file>.old``,
    ``output_file`` is removed and the file named like ``output_file`` with
    ``.srt`` replaced by ``.vtt`` is renamed to ``input_file``.

    Args:
        input_file: Path of the SRT file to translate.
        output_file: Path of the intermediate translated SRT file.
        input_language: Source language passed to ``translate``.
        output_language: Target language passed to ``translate``.
    """
    print('processing file', input_file)
    subtitles = SubRipFile.open(input_file)
    print(">", "read file", input_file)

    for entry in subtitles:
        print(entry.text)
        translated = translate(entry.text, input_language, output_language)
        entry.text = (entry.text + " (" + translated + ")"
                      if both_language else translated)
        print(entry.text)

    # Translated SRT -> WebVTT written next to it.
    subtitles.save(output_file, 'utf-8')
    WebVTT().from_srt(output_file).save()

    # Keep the original as a backup and put the converted file in its place.
    backup_name = input_file + ".old"
    os.rename(input_file, backup_name)
    os.remove(output_file)
    converted_name = output_file.replace(".srt", ".vtt")
    os.rename(converted_name, input_file)
    print(">", output_file, "saved!")
    def translate(self):
        """Translate ``<fileNameWOType>.vtt`` caption by caption.

        Every caption text is sent to
        ``Translate.AWSTranslate.translate_text`` with the instance's source
        and target language codes.  The translated captions keep the original
        timing and identifier and are saved to
        ``<fileNameWOType>_<targetLanguage>.vtt``.

        Returns:
            int: Always ``1``.
        """
        source_name = self.fileNameWOType + '.vtt'
        translated_vtt = WebVTT()

        for cue in webvtt.read(source_name):
            response = Translate.AWSTranslate.translate_text(
                Text=cue.text,
                SourceLanguageCode=self.sourceLanguage,
                TargetLanguageCode=self.targetLanguage)

            translated_cue = Caption(cue.start, cue.end,
                                     response.get('TranslatedText'))
            translated_cue.identifier = cue.identifier
            translated_vtt.captions.append(translated_cue)

        translated_vtt.save(
            self.fileNameWOType + '_' + self.targetLanguage + '.vtt')
        return 1
Example #14
0
def process_video_url(url, pk):
    """Fetch the YouTube transcript of ``url`` and attach it to a ``Video``.

    The transcript entries are turned into WebVTT captions, written to a
    temporary ``<vid_id>.vtt`` file inside ``CACHE`` and stored in the
    ``transcript`` field of the ``Video`` with primary key ``pk``.  The
    temporary file is removed afterwards.

    Args:
        url: Video URL understood by ``get_youtube_vid_id``.
        pk: Primary key of the ``Video`` object to update.
    """

    def _timestamp(delta):
        # str(timedelta) omits the fractional part when it is zero.
        text = str(delta)
        return text if "." in text else text + ".000"

    vid_id = get_youtube_vid_id(url)
    captions = YouTubeTranscriptApi.get_transcript(video_id=vid_id)

    vtt = WebVTT()
    for t in captions:
        # The t["start"] / t["duration"] values are scaled by 1000 and used
        # as milliseconds.
        start = datetime.timedelta(milliseconds=t["start"] * 1000)
        end = datetime.timedelta(milliseconds=t["duration"] * 1000) + start
        vtt.captions.append(
            Caption(start=_timestamp(start),
                    end=_timestamp(end),
                    text=t["text"]))

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(CACHE, exist_ok=True)

    path = os.path.join(CACHE, f"{vid_id}.vtt")
    vtt.save(path)

    try:
        # Keep the file open (and present) until the model has been saved,
        # then close it; the handle used to leak and the file was deleted
        # while still open.
        with open(path, "rb") as handle:
            obj = VidSpark.management.models.Video.objects.get(pk=pk)
            obj.transcript = File(handle)
            obj.save()
    finally:
        os.remove(path)
def text_extract():
    """Split ``static/subtitle.vtt`` into one WebVTT file per sentiment.

    Every caption text is passed to ``predict``.  Captions labelled "joy" go
    to ``static/Text/positive.vtt`` (green); "anger", "sadness" and "fear" go
    to ``static/Text/negative.vtt`` (red); "neutral" goes to
    ``static/Text/neutral.vtt`` (blue).  Captions with any other label are
    dropped.  Each kept caption is rewritten as
    ``<c.COLOUR> EMOTION: TEXT</c>``.

    The ``static/Text`` directory is recreated empty first; if that fails a
    message is printed and processing continues.
    """
    try:
        # Start from an empty output directory.
        if os.path.exists('static/Text'):
            shutil.rmtree('static/Text')

        os.makedirs('static/Text')

    # if not created then report the error
    except OSError:
        print('Error: Creating directory of data')

    vtt_pos = WebVTT()
    vtt_neg = WebVTT()
    vtt_neu = WebVTT()

    # emotion -> (cue colour, destination document)
    routing = {
        "joy": ("green", vtt_pos),
        "anger": ("red", vtt_neg),
        "sadness": ("red", vtt_neg),
        "fear": ("red", vtt_neg),
        "neutral": ("blue", vtt_neu),
    }

    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        route = routing.get(emotion)
        if route is None:
            # Labels outside the table are not written anywhere.
            continue
        colour, document = route
        text = "<c." + colour + "> " + emotion + ": " + line.text + "</c>"
        document.captions.append(Caption(line.start, line.end, text))

    vtt_pos.save('static/Text/positive.vtt')
    vtt_neg.save('static/Text/negative.vtt')
    vtt_neu.save('static/Text/neutral.vtt')
Example #16
0
class WebVTTTestCase(unittest.TestCase):
    """Tests for ``WebVTT`` reading, saving and conversion, and ``Caption``.

    Fixture files are looked up in ``SUBTITLES_DIR``.  Files produced by the
    tests are written below ``OUTPUT_DIR``, which is removed after each test.
    """

    def setUp(self):
        # A fresh, empty document for every test.
        self.webvtt = WebVTT()

    def _get_file(self, filename):
        """Return the path of the fixture ``filename`` in ``SUBTITLES_DIR``."""
        return os.path.join(SUBTITLES_DIR, filename)

    def tearDown(self):
        # Remove whatever the test wrote.
        if os.path.exists(OUTPUT_DIR):
            rmtree(OUTPUT_DIR)

    def test_create_caption(self):
        """A caption exposes its timestamps, their seconds and its lines."""
        caption = Caption('00:00:00.500', '00:00:07.000',
                          ['Caption test line 1', 'Caption test line 2'])
        self.assertEqual(caption.start, '00:00:00.500')
        self.assertEqual(caption.start_in_seconds, 0.5)
        self.assertEqual(caption.end, '00:00:07.000')
        self.assertEqual(caption.end_in_seconds, 7)
        self.assertEqual(caption.lines,
                         ['Caption test line 1', 'Caption test line 2'])

    def test_save_captions(self):
        """A caption appended after read() is written by save()."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)

        self.webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        new_caption = Caption(
            '00:00:07.000', '00:00:11.890',
            ['New caption text line1', 'New caption text line2'])
        self.webvtt.captions.append(new_caption)
        # No argument: the file that was read is overwritten.
        self.webvtt.save()

        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', '00:00:07.000 --> 00:00:11.890', 'New caption text line1',
            'New caption text line2'
        ]

        self.assertListEqual(lines, expected_lines)

    def test_srt_conversion(self):
        """from_srt() followed by save() writes a .vtt next to the .srt."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.srt'), OUTPUT_DIR)

        self.webvtt.from_srt(os.path.join(OUTPUT_DIR, 'one_caption.srt'))
        self.webvtt.save()

        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'one_caption.vtt')))

        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.500 --> 00:00:07.000',
            'Caption text #1',
        ]

        self.assertListEqual(lines, expected_lines)

    def test_sbv_conversion(self):
        """from_sbv() followed by save() writes a .vtt next to the .sbv."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('two_captions.sbv'), OUTPUT_DIR)

        self.webvtt.from_sbv(os.path.join(OUTPUT_DIR, 'two_captions.sbv'))
        self.webvtt.save()

        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'two_captions.vtt')))

        with open(os.path.join(OUTPUT_DIR, 'two_captions.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.378 --> 00:00:11.378',
            'Caption text #1',
            '',
            '00:00:11.378 --> 00:00:12.305',
            'Caption text #2 (line 1)',
            'Caption text #2 (line 2)',
        ]

        self.assertListEqual(lines, expected_lines)

    def test_save_to_other_location(self):
        """save(<directory>) keeps the source file name inside that directory."""
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)

        self.webvtt.read(self._get_file('one_caption.vtt')).save(target_path)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'one_caption.vtt')))

    def test_save_specific_filename(self):
        """save(<file>.vtt) writes exactly that file."""
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name.vtt')

        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(os.path.exists(output_file))

    def test_save_specific_filename_no_extension(self):
        """save(<file>) without extension writes <file>.vtt."""
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name')

        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'custom_name.vtt')))

    def test_caption_timestamp_update(self):
        """start and end can be reassigned after construction."""
        c = Caption('00:00:00.500', '00:00:07.000')
        c.start = '00:00:01.750'
        c.end = '00:00:08.250'

        self.assertEqual(c.start, '00:00:01.750')
        self.assertEqual(c.end, '00:00:08.250')

    def test_caption_text(self):
        """text joins a list of lines with newlines."""
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_caption_receive_text(self):
        """A multi-line string passed as text is split into lines."""
        c = Caption(text='Caption line #1\nCaption line #2')

        self.assertEqual(len(c.lines), 2)
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_supported_formats(self):
        """supported_formats() lists the first item of each SUPPORTED_FORMATS entry."""
        self.assertListEqual(WebVTT().supported_formats(),
                             [sf[0] for sf in SUPPORTED_FORMATS])

    def test_update_text(self):
        """text can be replaced by a new single-line string."""
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1 updated'
        self.assertEqual(c.text, 'Caption line #1 updated')

    def test_update_text_multiline(self):
        """Assigning a multi-line string to text updates lines as well."""
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1\nCaption line #2'

        self.assertEqual(len(c.lines), 2)

        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_update_text_wrong_type(self):
        """Assigning the integer 123 to text raises AttributeError."""
        c = Caption(text='Caption line #1')

        self.assertRaises(AttributeError, setattr, c, 'text', 123)

    def test_manipulate_lines(self):
        """Individual entries of lines can be replaced."""
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        c.lines[0] = 'Caption line #1 updated'
        self.assertEqual(c.lines[0], 'Caption line #1 updated')

    def test_captions(self):
        """captions is a list after reading a file."""
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt.captions, list)

    def test_captions_prevent_write(self):
        """Rebinding the captions attribute raises AttributeError."""
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertRaises(AttributeError, setattr, self.webvtt, 'captions', [])

    def test_sequence_iteration(self):
        """The document supports indexing and len() over its captions."""
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt[0], Caption)
        self.assertEqual(len(self.webvtt), len(self.webvtt.captions))

    def test_save_no_filename(self):
        """save() on a document with no file name raises MissingFilenameError."""
        webvtt = WebVTT()
        self.assertRaises(MissingFilenameError, webvtt.save)

    def test_malformed_start_timestamp(self):
        """The start timestamp '01:00' raises MalformedCaptionError."""
        self.assertRaises(MalformedCaptionError, Caption, '01:00')
Example #17
0
# -*- coding: utf-8 -*-
import pysrt
import webvtt
from webvtt import WebVTT, Caption
# Convert an SRT file to WebVTT, echoing every subtitle while doing so.
subs = pysrt.open(
    'Tanmay Bakshi - New Google Employee Indian Boy Going To Ninth Grade.srt',
    encoding='utf-8')
vtt = WebVTT()
for ligne in subs:
    fields = (ligne.start, ligne.end, ligne.text)
    # Echo start, end and text, one per line.
    for field in fields:
        print(str(field))
    caption = Caption(*[str(field) for field in fields])
    vtt.captions.append(caption)
vtt.save('_fr.vtt')
# Load the source subtitles.
vtt = WebVTT()
vtt.read(filename)

# Serialise every caption as a <span> that carries its timing in data
# attributes, wrapped in a single <div>, so the timing can be read back from
# the translated markup.
html_lines = ["<div>"]
html_lines.extend(
    '<span data-start="{}" data-end="{}">{}</span>'.format(
        caption.start, caption.end, caption.text)
    for caption in vtt)
html_lines.append("</div>")

stmp = StringIO()
print(*html_lines, sep="\n", file=stmp)

# Translate
driver = TranslationDriver(args.lang)
strans = driver.translate(stmp.getvalue())

# Convert translated HTML back to VTT
soup = BeautifulSoup(strans, "lxml")
vtt = WebVTT()
for span in soup.find_all("span"):
    vtt.captions.append(
        Caption(span["data-start"], span["data-end"], span.text))

# Remove the english file
os.remove(filename)

# The output name swaps the ".en." marker for the target language.
outfile = filename.replace(".en.", ".{}.".format(args.lang))
vtt.save(outfile)
print(green(outfile, bold=True))
Example #19
0
class WebVTTTestCase(unittest.TestCase):
    """Tests for ``WebVTT`` I/O and conversion, ``Caption`` and ``Style``.

    Fixture files are looked up in ``SUBTITLES_DIR``.  Files produced by the
    tests are written below ``OUTPUT_DIR``, which is removed after each test.
    """

    def setUp(self):
        # A fresh, empty document for every test.
        self.webvtt = WebVTT()

    def _get_file(self, filename):
        """Return the path of the fixture ``filename`` in ``SUBTITLES_DIR``."""
        return os.path.join(SUBTITLES_DIR, filename)

    def tearDown(self):
        # Remove whatever the test wrote.
        if os.path.exists(OUTPUT_DIR):
            rmtree(OUTPUT_DIR)

    def test_create_caption(self):
        """A caption exposes its timestamps, their seconds and its lines."""
        caption = Caption('00:00:00.500', '00:00:07.000',
                          ['Caption test line 1', 'Caption test line 2'])
        self.assertEqual(caption.start, '00:00:00.500')
        self.assertEqual(caption.start_in_seconds, 0.5)
        self.assertEqual(caption.end, '00:00:07.000')
        self.assertEqual(caption.end_in_seconds, 7)
        self.assertEqual(caption.lines,
                         ['Caption test line 1', 'Caption test line 2'])

    def test_write_captions(self):
        """write() emits the document, including appended captions, to a stream."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)

        out = io.StringIO()
        self.webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        new_caption = Caption(
            '00:00:07.000', '00:00:11.890',
            ['New caption text line1', 'New caption text line2'])
        self.webvtt.captions.append(new_caption)
        self.webvtt.write(out)

        # Rewind so the written content can be read back.
        out.seek(0)
        lines = [line.rstrip() for line in out.readlines()]

        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', '00:00:07.000 --> 00:00:11.890', 'New caption text line1',
            'New caption text line2'
        ]

        self.assertListEqual(lines, expected_lines)

    def test_save_captions(self):
        """A caption appended after read() is written by save()."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)

        self.webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        new_caption = Caption(
            '00:00:07.000', '00:00:11.890',
            ['New caption text line1', 'New caption text line2'])
        self.webvtt.captions.append(new_caption)
        # No argument: the file that was read is overwritten.
        self.webvtt.save()

        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', '00:00:07.000 --> 00:00:11.890', 'New caption text line1',
            'New caption text line2'
        ]

        self.assertListEqual(lines, expected_lines)

    def test_srt_conversion(self):
        """from_srt() followed by save() writes a .vtt next to the .srt."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.srt'), OUTPUT_DIR)

        self.webvtt.from_srt(os.path.join(OUTPUT_DIR, 'one_caption.srt'))
        self.webvtt.save()

        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'one_caption.vtt')))

        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.500 --> 00:00:07.000',
            'Caption text #1',
        ]

        self.assertListEqual(lines, expected_lines)

    def test_sbv_conversion(self):
        """from_sbv() followed by save() writes a .vtt next to the .sbv."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('two_captions.sbv'), OUTPUT_DIR)

        self.webvtt.from_sbv(os.path.join(OUTPUT_DIR, 'two_captions.sbv'))
        self.webvtt.save()

        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'two_captions.vtt')))

        with open(os.path.join(OUTPUT_DIR, 'two_captions.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.378 --> 00:00:11.378',
            'Caption text #1',
            '',
            '00:00:11.378 --> 00:00:12.305',
            'Caption text #2 (line 1)',
            'Caption text #2 (line 2)',
        ]

        self.assertListEqual(lines, expected_lines)

    def test_save_to_other_location(self):
        """save(<directory>) keeps the source file name inside that directory."""
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)

        self.webvtt.read(self._get_file('one_caption.vtt')).save(target_path)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'one_caption.vtt')))

    def test_save_specific_filename(self):
        """save(<file>.vtt) writes exactly that file."""
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name.vtt')

        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(os.path.exists(output_file))

    def test_save_specific_filename_no_extension(self):
        """save(<file>) without extension writes <file>.vtt."""
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name')

        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'custom_name.vtt')))

    def test_caption_timestamp_update(self):
        """start and end can be reassigned after construction."""
        c = Caption('00:00:00.500', '00:00:07.000')
        c.start = '00:00:01.750'
        c.end = '00:00:08.250'

        self.assertEqual(c.start, '00:00:01.750')
        self.assertEqual(c.end, '00:00:08.250')

    def test_caption_timestamp_format(self):
        """Timestamps given without hours are reported with a 00 hour field."""
        c = Caption('01:02:03.400', '02:03:04.500')
        self.assertEqual(c.start, '01:02:03.400')
        self.assertEqual(c.end, '02:03:04.500')

        c = Caption('02:03.400', '03:04.500')
        self.assertEqual(c.start, '00:02:03.400')
        self.assertEqual(c.end, '00:03:04.500')

    def test_caption_text(self):
        """text joins a list of lines with newlines."""
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_caption_receive_text(self):
        """A multi-line string passed as text is split into lines."""
        c = Caption(text='Caption line #1\nCaption line #2')

        self.assertEqual(len(c.lines), 2)
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_update_text(self):
        """text can be replaced by a new single-line string."""
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1 updated'
        self.assertEqual(c.text, 'Caption line #1 updated')

    def test_update_text_multiline(self):
        """Assigning a multi-line string to text updates lines as well."""
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1\nCaption line #2'

        self.assertEqual(len(c.lines), 2)

        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_update_text_wrong_type(self):
        """Assigning the integer 123 to text raises AttributeError."""
        c = Caption(text='Caption line #1')

        self.assertRaises(AttributeError, setattr, c, 'text', 123)

    def test_manipulate_lines(self):
        """Individual entries of lines can be replaced."""
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        c.lines[0] = 'Caption line #1 updated'
        self.assertEqual(c.lines[0], 'Caption line #1 updated')

    def test_captions(self):
        """captions is a list after reading a file."""
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt.captions, list)

    def test_captions_prevent_write(self):
        """Rebinding the captions attribute raises AttributeError."""
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertRaises(AttributeError, setattr, self.webvtt, 'captions', [])

    def test_sequence_iteration(self):
        """The document supports indexing and len() over its captions."""
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt[0], Caption)
        self.assertEqual(len(self.webvtt), len(self.webvtt.captions))

    def test_save_no_filename(self):
        """save() on a document with no file name raises MissingFilenameError."""
        webvtt = WebVTT()
        self.assertRaises(MissingFilenameError, webvtt.save)

    def test_malformed_start_timestamp(self):
        """The start timestamp '01:00' raises MalformedCaptionError."""
        self.assertRaises(MalformedCaptionError, Caption, '01:00')

    def test_set_styles_from_text(self):
        """Assigning multi-line text to a Style splits it into lines."""
        style = Style()
        style.text = '::cue(b) {\n  color: peachpuff;\n}'
        self.assertListEqual(style.lines,
                             ['::cue(b) {', '  color: peachpuff;', '}'])

    def test_get_styles_as_text(self):
        """Style.text joins the lines without separators or indentation."""
        style = Style()
        style.lines = ['::cue(b) {', '  color: peachpuff;', '}']
        self.assertEqual(style.text, '::cue(b) {color: peachpuff;}')

    def test_save_identifiers(self):
        """Caption identifiers read from a file are written back unchanged."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)

        self.webvtt.read(os.path.join(OUTPUT_DIR, 'using_identifiers.vtt'))
        self.webvtt.save(os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt'))

        with open(os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', 'second caption', '00:00:07.000 --> 00:00:11.890',
            'Caption text #2', '', '00:00:11.890 --> 00:00:16.320',
            'Caption text #3', '', '4', '00:00:16.320 --> 00:00:21.580',
            'Caption text #4', '', '00:00:21.580 --> 00:00:23.880',
            'Caption text #5', '', '00:00:23.880 --> 00:00:27.280',
            'Caption text #6'
        ]

        self.assertListEqual(lines, expected_lines)

    def test_save_updated_identifiers(self):
        """Added, changed and cleared identifiers are reflected in the saved file."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)

        self.webvtt.read(os.path.join(OUTPUT_DIR, 'using_identifiers.vtt'))
        # Add an identifier, clear one, change one, and append a new caption
        # that has an identifier of its own.
        self.webvtt.captions[0].identifier = 'first caption'
        self.webvtt.captions[1].identifier = None
        self.webvtt.captions[3].identifier = '44'
        last_caption = Caption('00:00:27.280', '00:00:29.200',
                               'Caption text #7')
        last_caption.identifier = 'last caption'
        self.webvtt.captions.append(last_caption)
        self.webvtt.save(os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt'))

        with open(os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt'),
                  'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]

        expected_lines = [
            'WEBVTT', '', 'first caption', '00:00:00.500 --> 00:00:07.000',
            'Caption text #1', '', '00:00:07.000 --> 00:00:11.890',
            'Caption text #2', '', '00:00:11.890 --> 00:00:16.320',
            'Caption text #3', '', '44', '00:00:16.320 --> 00:00:21.580',
            'Caption text #4', '', '00:00:21.580 --> 00:00:23.880',
            'Caption text #5', '', '00:00:23.880 --> 00:00:27.280',
            'Caption text #6', '', 'last caption',
            '00:00:27.280 --> 00:00:29.200', 'Caption text #7'
        ]

        self.assertListEqual(lines, expected_lines)
Example #20
0
def _format_hms(total_seconds):
    """Return a whole number of seconds as a zero-padded ``HH:MM:SS`` string."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)


def genarateSUB(url, lang):
    """Generate a translated WebVTT subtitle file for the video at *url*.

    The video is downloaded, its audio is extracted to a WAV file and split
    into numbered chunks (read back as ``1.wav``, ``2.wav``, ...).  Each chunk
    is transcribed and translated into *lang*; every chunk whose transcript
    is not a single space becomes one caption covering that chunk's
    five-second window.  After the subtitle file is saved, the split chunks,
    their directory and the extracted WAV file are deleted.

    Args:
        url: Video URL handed to the title, download and audio helpers.
        lang: Target language code passed to the translator and used as the
            suffix of the subtitle file name.

    Returns:
        A ``(videoName, vttName)`` tuple: the value returned by
        ``Extract_Caption.download_video`` and the subtitle file's name
        (``<title>_<lang>.vtt``).
    """
    caption_title = Extract_Caption.extractTitle(url)

    video_name = Extract_Caption.download_video(url)
    wav_file_path = extractWavAudio.extractWAV(url)

    # Split the extracted audio file into multiple chunk files.
    AudioSplit.split(wav_file_path, caption_title)

    # Subtitle container that collects one caption per transcribed chunk.
    vtt = WebVTT()

    splits_dir = '../Datas/Splits/' + caption_title + '/'
    num_files = len(os.listdir(splits_dir))

    segment_seconds = 5
    for index in range(1, num_files + 1):
        # Chunk N is assigned the window [(N-1)*5, N*5) seconds.
        start = (index - 1) * segment_seconds
        end = start + segment_seconds

        text, confidence = ms_asr.transcribe(splits_dir + str(index) + '.wav')
        print("Text: ", text)
        print("Confidence: ", confidence)

        # A transcript of exactly one space is treated as "nothing to
        # caption" (presumably what the recogniser returns for silence --
        # TODO confirm against ms_asr).
        has_speech = text != " "
        if has_speech:
            translated_text = TRANSLATR_TO_TEXT.translateFromTXT(text, lang)
        else:
            translated_text = " "
        print("Translated Text: ", translated_text)

        if has_speech:
            # NOTE(review): the trailing space / newline in the timestamp and
            # text strings are kept exactly as before; they look unintended
            # but whether Caption tolerates their removal was not verified.
            caption = Caption(
                _format_hms(start) + '.001 ',
                _format_hms(end) + '.000\n',
                str(translated_text) + '\n')
            vtt.captions.append(caption)

    vtt_name = caption_title + "_" + lang + ".vtt"
    vtt.save("../webApp/static/SubtitleFile/" + vtt_name)

    # Remove the temporary chunk files, their directory and the WAV file.
    for chunk_path in glob.glob(splits_dir + '*'):
        os.remove(chunk_path)
    os.rmdir('../Datas/Splits/' + caption_title)
    os.remove(wav_file_path)

    return video_name, vtt_name
Example #21
0
    def add_word(self,
                 word,
                 collection,
                 start,
                 end,
                 name,
                 add_type,
                 word_type,
                 group,
                 word_id='',
                 wordset_id=''):
        """Cut a clip for *word* out of a video and upload it with subtitles.

        The file called *name* is searched for under the download directory.
        The ``start``..``end`` range of that video is written out as an MP4,
        the cues of the sibling ``.srt`` file that fall inside the range are
        highlighted, shifted to the clip's timeline and converted to WebVTT,
        and both files are POSTed to ``http://<server_ip>/video`` together
        with the word metadata.

        Args:
            word: The word to highlight; surrounding whitespace is stripped
                for the file name and the highlight, the raw value is sent
                in the form data.
            collection: Sent as ``wordbase_collection``.
            start: Clip start in seconds (converted with ``float``).
            end: Clip end in seconds (converted with ``float``).
            name: File name of the source video, including its extension.
            add_type: Sent as ``add_type``.
            word_type: Sent as ``word_type``.
            group: Sent as ``group``.
            word_id: Sent as ``word_id``.
            wordset_id: Sent as ``wordset_id``.

        Returns:
            The string ``"true"``; the HTTP response is not inspected.

        Raises:
            FileNotFoundError: If no file named *name* exists under the
                download directory.
        """
        clean_word = word.strip()
        pure_filename = name.split('.')[0]
        puresave_filename = pure_filename + "~" + clean_word

        data = {
            'filename': puresave_filename,
            'wordbase_collection': collection,
            'word': word,
            'add_type': add_type,
            'word_type': word_type,
            'group': group,
            'word_id': word_id,
            'wordset_id': wordset_id,
        }

        # Raw string: "\B" is not a valid escape sequence and triggers a
        # warning on recent Python versions when written in a normal literal.
        work_dir = r"D:\BaiduYunDownload"
        videos_prefix = work_dir + "\\videos\\"

        # First directory (in os.walk order) that contains a file called name.
        parent_path = next(
            (parent
             for parent, _dirnames, filenames in os.walk(work_dir,
                                                         followlinks=True)
             if name in filenames),
            None,
        )
        if parent_path is None:
            # Fail early with a clear message instead of handing an empty
            # path to the video reader.
            raise FileNotFoundError(
                "video file {!r} not found under {}".format(name, work_dir))
        file_path = os.path.join(parent_path, name)

        start_time = float(start)
        end_time = float(end)

        subfile_path = os.path.join(parent_path, pure_filename + ".srt")
        target = videos_prefix + puresave_filename + ".mp4"

        video_clip = VideoFileClip(file_path)
        try:
            clip = video_clip.subclip(start_time, end_time)
            clip.write_videofile(target,
                                 codec='libx264',
                                 verbose=False,
                                 audio=True)
        finally:
            # Release the reader even when writing the clip fails.
            video_clip.close()

        subtitle = SSAFile.load(subfile_path)
        # Placeholder SRT document (one empty cue) used to seed the output
        # subtitle file; same text as the original indented literal.
        placeholder = (
            '\n'
            '        1\n'
            '        00:00:00,000 --> 00:00:00,000\n'
            '        \n'
            '        '
        )
        temp = SSAFile().from_string(placeholder)
        for sub in subtitle:
            # Subtitle times are compared in milliseconds; start/end are
            # seconds.
            if sub.start >= start_time * 1000 and sub.end <= end_time * 1000:
                sub.text = sub.text.replace(
                    clean_word, '<c.video-heightlight>' + clean_word + '</c>')
                # Re-time the cue relative to the beginning of the clip.
                sub.shift(s=-start_time)
                temp.append(sub)
        sub_target = videos_prefix + puresave_filename
        temp.save(sub_target + '.srt')
        vtt = WebVTT().from_srt(sub_target + '.srt')
        vtt.save(sub_target + '.vtt')

        # Keep both files open only for the duration of the upload.
        with open(target, "rb") as video_file, \
                open(sub_target + '.vtt', "rb") as subtitle_file:
            requests.post('http://' + server_ip + '/video',
                          data=data,
                          files={
                              "video": video_file,
                              "subtitle": subtitle_file,
                          })

        return "true"