def write_to_subtitles(data: Union[List[dict], List[List[dict]]]):
    """Returns WebVTT object from data.

    Args:
        data (Union[List[dict], List[List[dict]]]): data must be either a
            'word'-type tier with a list of dicts that have keys for 'start',
            'end' and 'text', or a 'sentence'-type tier with a list of lists
            of dicts.

    Returns:
        WebVTT: WebVTT subtitles
    """
    vtt = WebVTT()
    for caption in data:
        if isinstance(caption, list):
            formatted = Caption(
                float_to_timedelta(caption[0]["start"]),
                float_to_timedelta(caption[-1]["end"]),
                " ".join([w["text"] for w in caption]),
            )
        else:
            formatted = Caption(
                float_to_timedelta(caption["start"]),
                float_to_timedelta(caption["end"]),
                caption["text"],
            )
        vtt.captions.append(formatted)
    return vtt

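# Hedged usage sketch for write_to_subtitles (the values are made up; it relies
# on the same float_to_timedelta helper the function above assumes is in scope):
word_tier = [
    {"start": 0.0, "end": 0.8, "text": "hello"},
    {"start": 0.9, "end": 1.5, "text": "world"},
]
sentence_tier = [word_tier]  # one sentence built from the word dicts above
write_to_subtitles(word_tier).save("words.vtt")
write_to_subtitles(sentence_tier).save("sentences.vtt")
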
def enrichment_to_vtt(list_enrichment, video):
    webvtt = WebVTT()
    for enrich in list_enrichment:
        start = datetime.datetime.utcfromtimestamp(
            enrich.start).strftime("%H:%M:%S.%f")[:-3]
        end = datetime.datetime.utcfromtimestamp(
            enrich.end).strftime("%H:%M:%S.%f")[:-3]
        url = enrichment_to_vtt_type(enrich)
        caption = Caption(
            "{0}".format(start),
            "{0}".format(end),
            [
                "{",
                '"title": "{0}",'.format(enrich.title),
                '"type": "{0}",'.format(enrich.type),
                '"stop_video": "{0}",'.format("%s" % 1 if enrich.stop_video else 0),
                '"url": "{0}"'.format(url),
                "}",
            ],
        )
        caption.identifier = enrich.slug
        webvtt.captions.append(caption)

    temp_vtt_file = NamedTemporaryFile(suffix=".vtt")
    with open(temp_vtt_file.name, "w") as f:
        webvtt.write(f)

    if FILEPICKER:
        videodir, created = UserFolder.objects.get_or_create(
            name="%s" % video.slug, owner=video.owner)
        previousEnrichmentFile = CustomFileModel.objects.filter(
            name__startswith="enrichment",
            folder=videodir,
            created_by=video.owner,
        )
        for enr in previousEnrichmentFile:
            enr.delete()  # do it like this to delete file
        enrichmentFile, created = CustomFileModel.objects.get_or_create(
            name="enrichment", folder=videodir, created_by=video.owner)
        if enrichmentFile.file and os.path.isfile(enrichmentFile.file.path):
            os.remove(enrichmentFile.file.path)
    else:
        enrichmentFile, created = CustomFileModel.objects.get_or_create()

    enrichmentFile.file.save("enrichment.vtt", File(temp_vtt_file))
    enrichmentVtt, created = EnrichmentVtt.objects.get_or_create(video=video)
    enrichmentVtt.src = enrichmentFile
    enrichmentVtt.save()
    return enrichmentFile.file.path

def transcribe():
    command = [
        'ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i', sys.argv[1],
        '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ]
    process = subprocess.Popen(command, stdout=subprocess.PIPE)

    results = []
    while True:
        data = process.stdout.read(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    results.append(rec.FinalResult())

    vtt = WebVTT()
    for i, res in enumerate(results):
        words = json.loads(res).get('result')
        if not words:
            continue
        start = timeString(words[0]['start'])
        end = timeString(words[-1]['end'])
        content = ' '.join([w['word'] for w in words])
        caption = Caption(start, end, textwrap.fill(content))
        vtt.captions.append(caption)

    # save the webvtt to a file, or print it
    if len(sys.argv) > 2:
        vtt.save(sys.argv[2])
    else:
        print(vtt.content)

def create_overview_vtt(video_id, nb_img, image, duration, overviewfilename):
    msg = "\ncreate overview vtt file"
    image_width = image["image_width"]
    image_height = image["image_height"]
    image_url = image["image_url"]
    # creating webvtt file
    webvtt = WebVTT()
    for i in range(0, nb_img):
        if nb_img == 99:
            start = format(float(duration * i / 100), '.3f')
            end = format(float(duration * (i + 1) / 100), '.3f')
        else:
            start = format(float(i), '.3f')
            end = format(float(i + 1), '.3f')

        start_time = time.strftime(
            '%H:%M:%S', time.gmtime(int(str(start).split('.')[0])))
        start_time += ".%s" % (str(start).split('.')[1])
        end_time = time.strftime(
            '%H:%M:%S',
            time.gmtime(int(str(end).split('.')[0]))
        ) + ".%s" % (str(end).split('.')[1])

        caption = Caption(
            '%s' % start_time,
            '%s' % end_time,
            '%s#xywh=%d,%d,%d,%d' % (image_url, image_width * i, 0,
                                     image_width, image_height))
        webvtt.captions.append(caption)

    webvtt.save(overviewfilename)
    if check_file(overviewfilename):
        msg += "\n- overviewfilename :\n%s" % overviewfilename
    else:
        msg = "overviewfilename Wrong file or path : "\
            + "\n%s" % overviewfilename
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, -1, msg)
        send_email(msg, video_id)
    return msg

def file_writing(path):
    vtt = WebVTT()
    caption = Caption()
    emotion = ""
    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        if emotion == "joy":
            caption = Caption(
                line.start, line.end,
                "<c.green> " + emotion + ": " + line.text + "</c>")
        elif emotion == "fear":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")
        elif emotion == "anger":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")
        elif emotion == "sadness":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")
        elif emotion == "neutral":
            caption = Caption(
                line.start, line.end,
                "<c.blue> " + emotion + ": " + line.text + "</c>")
        else:
            caption = Caption(
                line.start, line.end,
                "<c.blue> " + emotion + ": " + line.text + "</c>")
        vtt.captions.append(caption)
    vtt.save('static/my_captions.vtt')

def generate_vtt_file(all_preds, logits, save_path):
    vtt = WebVTT()
    predictions = all_preds
    labels, starts, ends = get_labels_start_end_time(predictions, [1])

    # smaller boundaries
    for ix in range(len(labels)):
        if ix == len(labels) - 1:
            break
        diff = starts[ix + 1] - ends[ix]
        starts[ix + 1] -= floor(diff / 2)
        ends[ix] += floor(diff / 2)

    # load i3d classes
    i3d_scores = logits
    with open('data/info/bslcp/info.pkl', 'rb') as f:
        info_data = pickle.load(f)

    for start, end in zip(starts, ends):
        if logits is not None:
            i3d_score = np.sum(np.asarray(i3d_scores)[start:end], axis=0)
            ind = np.argpartition(i3d_score, -10)[-10:]
            ind = ind[np.argsort(-i3d_score[ind])]
            classes = [info_data['words'][ix] for ix in ind]
            class_str = ','.join(classes)
        else:
            class_str = ''

        start = (start + 8) / 25
        end = (end + 8) / 25

        start_dt = datetime.timedelta(seconds=start)
        start_str = str(start_dt)
        if '.' not in start_str:
            start_str = f'{start_str}.000000'

        end_dt = datetime.timedelta(seconds=end)
        end_str = str(end_dt)
        if '.' not in end_str:
            end_str = f'{end_str}.000000'

        # creating a caption with a list of lines
        caption = Caption(
            start_str,
            end_str,
            [class_str]
        )

        # adding a caption
        vtt.captions.append(caption)

    # save to a different file
    vtt.save(f'{save_path}/demo.vtt')

def annotations_to_webvtt(self, annotations):
    webvtt = WebVTT()
    last_index = len(annotations) - 1
    index = 0
    while index <= last_index:
        focus_annotation = annotations[index]
        # print("Focus:: BT: %d | ET: %d | Value: %s" % focus_annotation[:3])
        if index == last_index:
            caption = Caption(
                self.time_to_webvtt_time(focus_annotation[0]),
                self.time_to_webvtt_time(focus_annotation[1]),
                [focus_annotation[2]])
            # print("%s %s %s" % (caption.start, caption.end, caption.text))
            webvtt.captions.append(caption)
            index += 1
        else:
            for index_next in range(index + 1, last_index + 1):
                index = index_next
                next_annotation = annotations[index_next]
                # print("Next :: BT: %d | ET: %d | Value: %s" % next_annotation[:3])
                overlap = self.check_overlap(focus_annotation, next_annotation)
                if overlap:
                    # print("#%s#%s#" % (focus_annotation[2], next_annotation[2]))
                    if not (focus_annotation[2] == next_annotation[2]):
                        caption = Caption(
                            self.time_to_webvtt_time(focus_annotation[0]),
                            self.time_to_webvtt_time(next_annotation[0]),
                            [focus_annotation[2]])
                        # print("%s %s %s" % (caption.start, caption.end, caption.text))
                        webvtt.captions.append(caption)
                        break
                else:
                    caption = Caption(
                        self.time_to_webvtt_time(focus_annotation[0]),
                        self.time_to_webvtt_time(
                            min(focus_annotation[1], next_annotation[0])),
                        [focus_annotation[2]])
                    # print("%s %s %s" % (caption.start, caption.end, caption.text))
                    webvtt.captions.append(caption)
                    break
    return webvtt

def store_remote_transcripting_video(video_id):
    msg = ""
    video_to_encode = Video.objects.get(id=video_id)
    output_dir = create_outputdir(video_id, video_to_encode.video.path)
    info_video = {}

    if check_file(output_dir + "/transcript.json"):
        with open(output_dir + "/transcript.json") as json_file:
            info_video = json.load(json_file)

        print_if_debug(output_dir)
        print_if_debug(json.dumps(info_video, indent=2))

        webvtt = WebVTT()
        # Transcripts are sorted by confidence; the first one is the highest
        # confidence result.
        words = info_video["transcripts"][0]["words"]
        """
        for transcript in info_video["transcripts"]:
            for word in transcript["words"]:
                words.append(word)
        """
        text_caption = []
        start_caption = None
        duration = 0
        for word in words:
            text_caption.append(word['word'])
            if start_caption is None:
                start_caption = word['start_time']
            if duration + word['duration'] > SENTENCE_MAX_LENGTH:
                caption = Caption(
                    format_time_caption(start_caption),
                    format_time_caption(
                        start_caption + duration + word['duration']),
                    " ".join(text_caption))
                webvtt.captions.append(caption)
                text_caption = []
                start_caption = None
                duration = 0
            else:
                duration += word['duration']

        print_if_debug(webvtt)
        msg += saveVTT(video_to_encode, webvtt)
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, 0, "done")
        # send an email when transcription is complete
        if EMAIL_ON_TRANSCRIPTING_COMPLETION:
            send_email_transcript(video_to_encode)
    else:
        msg += "Wrong file or path : "\
            + "\n%s" % video_to_encode.video.path
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, -1, msg)
        send_email(msg, video_id)

def translate(self):
    newVTT = WebVTT()
    fileName = self.fileNameWOType + '.vtt'
    for caption in webvtt.read(fileName):
        # print(caption.start)
        # print(caption.end)
        # print(caption.text)
        translation = Translate.AWSTranslate.translate_text(
            Text=caption.text,
            SourceLanguageCode=self.sourceLanguage,
            TargetLanguageCode=self.targetLanguage)
        newCaption = Caption(caption.start, caption.end,
                             translation.get('TranslatedText'))
        newCaption.identifier = caption.identifier
        newVTT.captions.append(newCaption)
    translatedFileName = (self.fileNameWOType + '_' +
                          self.targetLanguage + '.vtt')
    newVTT.save(translatedFileName)
    return 1

def createCaption(arg, rate):
    start, end, text = arg
    start = format(start / rate, '.3f')
    end = format(end / rate, '.3f')
    start_time = time.strftime(
        '%H:%M:%S', time.gmtime(int(str(start).split('.')[0])))
    start_time += ".%s" % (str(start).split('.')[1])
    end_time = time.strftime(
        '%H:%M:%S', time.gmtime(int(str(end).split('.')[0])))
    end_time += ".%s" % (str(end).split('.')[1])
    caption = Caption('%s' % start_time, '%s' % end_time, '%s' % text)
    return caption

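# Hedged usage sketch for createCaption: sample-index boundaries plus the audio
# sample rate become a Caption (the 16 kHz rate and indices below are made up).
cap = createCaption((16000, 48000, "hello world"), 16000)
print(cap.start, cap.end, cap.text)  # 00:00:01.000 00:00:03.000 hello world
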
def text_extract():
    try:
        # creating a folder named data
        if os.path.exists('static/Text'):
            shutil.rmtree('static/Text')
        os.makedirs('static/Text')
    # if not created then raise error
    except OSError:
        print('Error: Creating directory of data')

    vtt_pos = WebVTT()
    vtt_neg = WebVTT()
    vtt_neu = WebVTT()
    caption = Caption()
    emotion = ""
    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        if emotion == "joy":
            caption = Caption(
                line.start, line.end,
                "<c.green> " + emotion + ": " + line.text + "</c>")
            vtt_pos.captions.append(caption)
        elif emotion == "anger" or emotion == "sadness" or emotion == "fear":
            caption = Caption(line.start, line.end,
                              "<c.red> " + emotion + ": " + line.text + "</c>")
            vtt_neg.captions.append(caption)
        elif emotion == "neutral":
            caption = Caption(
                line.start, line.end,
                "<c.blue> " + emotion + ": " + line.text + "</c>")
            vtt_neu.captions.append(caption)

    vtt_pos.save('static/Text/positive.vtt')
    vtt_neg.save('static/Text/negative.vtt')
    vtt_neu.save('static/Text/neutral.vtt')

def get(self, request, *args, **kwargs):
    feed = self.get_object()
    try:
        stream = get_object_or_404(feed.streams.all(), uuid=request.GET["stream"])
    except KeyError:
        return HttpResponseBadRequest(_("Bad request"))

    webvtt = WebVTT()
    resp = HttpResponse(content_type="text/vtt; charset=utf-8")

    try:
        start = parse_datetime(request.GET["start"])
        end = parse_datetime(request.GET["end"])
        epoch = parse_datetime(request.GET["epoch"])
    except KeyError:
        return HttpResponseBadRequest(_("Bad request"))

    if stream.program_date_time:
        start_diff = start - stream.started_at
        end_diff = end - stream.started_at
        start = stream.program_date_time + start_diff
        end = stream.program_date_time + end_diff
        epoch = stream.program_date_time

    start = start - timedelta(seconds=5)
    end = end + timedelta(seconds=5)
    items = feed.items.filter(starts_at__gte=start, ends_at__lt=end).order_by(
        "starts_at"
    )
    for item in items:
        start_timecode = self.get_vtt_timecode(epoch, item.starts_at)
        end_timecode = self.get_vtt_timecode(epoch, item.ends_at)
        data = {
            "uuid": item.uuid,
            "starts_at": item.starts_at.isoformat(),
            "ends_at": item.ends_at.isoformat(),
            "start_timecode": start_timecode,
            "end_timecode": end_timecode,
            "payload": item.payload,
        }
        cap = Caption(
            start_timecode, end_timecode, [json.dumps(data, cls=DjangoJSONEncoder)]
        )
        webvtt.captions.append(cap)

    webvtt.write(resp)
    return resp

def read_from_existing_vtt(bucket_name, file_name):
    vtt = WebVTT()
    blob = read_data_from_storage(bucket_name, file_name)
    blob = [
        string for string in blob.decode("utf-8").split('\n')[2:] if string
    ]
    start, end = '', ''
    for string in blob:
        if '-->' in string:
            start, end = string.split(' --> ')
        else:
            caption = Caption(start, end, string)
            vtt.captions.append(caption)
    return vtt

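# Hedged usage sketch for read_from_existing_vtt, assuming read_data_from_storage
# returns the raw VTT bytes for the given bucket/object (names are placeholders):
vtt = read_from_existing_vtt("my-bucket", "captions/episode1.vtt")
for caption in vtt.captions:
    print(caption.start, caption.end, caption.text)
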
def generate_subtitles(input_file_name, output_file_name):
    if not os.path.isfile(input_file_name):
        print("Invalid file name")
        return

    f_content = open_file(input_file_name)
    vtt = WebVTT()
    count = 0
    prev_start = ""
    prev_end = ""
    prev_str = ""
    for line in f_content:
        if line.startswith('Word'):
            # print(line, prev_str)
            l = line.strip() + '\n'
            m = re.search('Word: (.+?), start_time: (.+?), end_time: (.+?)\n', line)
            current_start = str(timedelta(seconds=float(m.group(2)), microseconds=1))
            current_end = str(timedelta(seconds=float(m.group(3)), microseconds=1))
            if count < 6:
                if count == 0:
                    prev_start = current_start
                count += 1
                prev_end = current_end
                prev_str += " " + m.group(1)
            if count == 6:
                # caption = Caption(
                #     str(timedelta(seconds=float(m.group(2)), microseconds=1)),
                #     str(timedelta(seconds=float(m.group(3)), microseconds=1)),
                #     str(m.group(1)))
                caption = Caption(prev_start, prev_end, prev_str)
                vtt.captions.append(caption)
                count = 0
                prev_start = ""
                prev_end = ""
                prev_str = ""
    write_vtt(vtt, output_file_name)

def process_video_url(url, pk):
    vid_id = get_youtube_vid_id(url)
    captions = YouTubeTranscriptApi.get_transcript(video_id=vid_id)
    vtt = WebVTT()
    for t in captions:
        start = datetime.timedelta(milliseconds=t["start"] * 1000)
        end = datetime.timedelta(milliseconds=t["duration"] * 1000) + start
        if "." not in str(start):
            start = str(start) + ".000"
        if "." not in str(end):
            end = str(end) + ".000"
        caption = Caption(
            start=str(start),
            end=str(end),
            text=t["text"]
        )
        vtt.captions.append(caption)

    if not os.path.isdir(CACHE):
        os.mkdir(CACHE)
    path = os.path.join(CACHE, f"{vid_id}.vtt")
    vtt.save(path)
    transcript = File(open(path, "rb"))
    os.remove(path)
    obj = VidSpark.management.models.Video.objects.get(pk=pk)
    obj.transcript = transcript
    obj.save()

vtt = WebVTT()
vtt.read(filename)

stmp = StringIO()
print("<div>", file=stmp)
for caption in vtt:
    print('<span data-start="{}" data-end="{}">{}</span>'.format(
        caption.start, caption.end, caption.text), file=stmp)
print("</div>", file=stmp)

# Translate
driver = TranslationDriver(args.lang)
strans = driver.translate(stmp.getvalue())

# Convert translated HTML back to VTT
vtt = WebVTT()
soup = BeautifulSoup(strans, "lxml")
for span in soup.find_all("span"):
    start = span["data-start"]
    end = span["data-end"]
    caption = Caption(start, end, span.text)
    vtt.captions.append(caption)

# Remove the english file
os.remove(filename)
outfile = filename.replace(".en.", ".{}.".format(args.lang))
vtt.save(outfile)
print(green(outfile, bold=True))

def test_single_invalid_caption(self):
    self.assertRaises(
        InvalidCaptionsError,
        self.segmenter.segment,
        [Caption(), Caption(), 'text', Caption()]
    )

def write_caption(self, start, end, line):
    caption = Caption(start, end, line)
    self.vtt.captions.append(caption)

def genarateSUB(url, lang):
    url = url
    language = lang

    # find out whether any English captions are available for the YouTube URL
    captionTitle = Extract_Caption.extractTitle(url)
    videoName = Extract_Caption.download_video(url)
    wavFilePath = extractWavAudio.extractWAV(url)

    # split the audio file into multiple segments
    AudioSplit.split(wavFilePath, captionTitle)

    # initialise the subtitle container
    vtt = WebVTT()

    # count the split wav files
    num_files = len(os.listdir('../Datas/Splits/' + captionTitle + '/'))
    cnt = 0
    start = 0
    end = 5
    for i in range(1, num_files + 1):
        flag = 0
        text, confidence = ms_asr.transcribe(
            '../Datas/Splits/' + captionTitle + '/' + str(i) + '.wav')
        print("Text: ", text)
        print("Confidence: ", confidence)
        if text == " ":
            translated_text = " "
        else:
            translated_text = TRANSLATR_TO_TEXT.translateFromTXT(text, language)
            flag = 1
            cnt += 1
            print("Translated Text: ", translated_text)

        if flag == 1:
            start_hours = start // 3600
            temp = start % 3600
            start_min = temp // 60
            start_sec = temp % 60
            end_hours = end // 3600
            temp = end % 3600
            end_min = temp // 60
            end_sec = temp % 60
            if start_hours <= 9:
                start_hours = '0' + str(start_hours)
            else:
                start_hours = str(start_hours)
            if start_min <= 9:
                start_min = '0' + str(start_min)
            else:
                start_min = str(start_min)
            if start_sec <= 9:
                start_sec = '0' + str(start_sec)
            else:
                start_sec = str(start_sec)
            if end_hours <= 9:
                end_hours = '0' + str(end_hours)
            else:
                end_hours = str(end_hours)
            if end_min <= 9:
                end_min = '0' + str(end_min)
            else:
                end_min = str(end_min)
            if end_sec <= 9:
                end_sec = '0' + str(end_sec)
            else:
                end_sec = str(end_sec)
            caption = Caption(
                start_hours + ':' + start_min + ':' + start_sec + '.001 ',
                end_hours + ':' + end_min + ':' + end_sec + '.000\n',
                str(translated_text) + '\n')
            vtt.captions.append(caption)

        start += 5
        end += 5

    vttFilePath = ("../webApp/static/SubtitleFile/" + captionTitle + "_" +
                   language + ".vtt")
    vtt.save(vttFilePath)
    vttName = captionTitle + "_" + language + ".vtt"

    # clean up the split audio files and the extracted wav
    files = glob.glob('../Datas/Splits/' + captionTitle + '/*')
    for f in files:
        os.remove(f)
    os.rmdir('../Datas/Splits/' + captionTitle)
    os.remove(wavFilePath)

    if language.__eq__('ta'):
        retlan = 'Tamil'
    if language.__eq__('si'):
        retlan = 'Sinhala'
    return videoName, vttName

def main_transcript(video_to_encode, ds_model):
    msg = ""
    inference_start = timer()
    msg += '\nInference start %0.3fs.' % inference_start

    mp3file = video_to_encode.get_video_mp3(
    ).source_file if video_to_encode.get_video_mp3() else None
    if mp3file is None:
        msg += "\n no mp3 file found for video :%s." % video_to_encode.id
        change_encoding_step(video_to_encode.id, -1, msg)
        send_email(msg, video_to_encode.id)
        return msg

    # NORMALIZE mp3file
    norm_mp3_file = normalize_mp3(mp3file.path)

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()
    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):
        end_trim = video_to_encode.duration if start_trim + \
            AUDIO_SPLIT_TIME > video_to_encode.duration else (
                start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)

        duration = (AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) if start_trim + \
            AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < video_to_encode.duration \
            else (video_to_encode.duration - start_trim)

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim, duration)

        audio = convert_samplerate(norm_mp3_file, desired_sample_rate,
                                   start_trim, duration)
        msg += '\nRunning inference.'

        metadata = ds_model.sttWithMetadata(audio)

        msg += '\nConfidence : %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list

        if len(metadata.items) > 0:
            refItem = metadata.items[0]

            index = get_index(metadata, last_item, start_trim) if last_item else 0

            # nb of character in AUDIO_SPLIT_TIME
            msg += "METADATA ITEMS : %d " % len(metadata.items)

            sentences = get_sentences(metadata, refItem, index)

            last_item = (
                sentences[-1][-1].character,
                sentences[-1][-1].start_time) if len(sentences) > 0 else ()

            for sent in sentences:
                if len(sent) > 0:
                    start_time = sent[0].start_time + start_trim
                    end_time = sent[-1].start_time + start_trim
                    str_sentence = ''.join(item.character for item in sent)
                    # print(start_time, end_time, str_sentence)
                    caption = Caption(
                        '%s.%s' % (
                            timedelta(seconds=int(str(start_time).split('.')[0])),
                            str('%.3f' % start_time).split('.')[1]),
                        '%s.%s' % (
                            timedelta(seconds=int(str(end_time).split('.')[0])),
                            str('%.3f' % end_time).split('.')[1]),
                        ['%s' % str_sentence])
                    webvtt.captions.append(caption)

    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)

    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    change_encoding_step(video_to_encode.id, 0, "done")

    # send an email when transcription is complete
    if EMAIL_ON_TRANSCRIPTING_COMPLETION:
        send_email_transcript(video_to_encode)

    return msg

def main_transcript(video_to_encode):
    msg = ""

    mp3file = video_to_encode.get_video_mp3(
    ).source_file if video_to_encode.get_video_mp3() else None

    lang = video_to_encode.main_lang

    # check if DS_PARAM[lang] exist
    if not DS_PARAM.get(lang):
        msg += "\n no deepspeech model found for lang:%s." % lang
        msg += "Please add it in DS_PARAM."
        return msg

    ds_model = Model(DS_PARAM[lang]['model'], DS_PARAM[lang]['beam_width'])

    if all([cond in DS_PARAM[lang]
            for cond in ['alphabet', 'lm', 'trie', 'lm_alpha', 'lm_beta']]):
        ds_model.enableDecoderWithLM(DS_PARAM[lang]['lm'],
                                     DS_PARAM[lang]['trie'],
                                     DS_PARAM[lang]['lm_alpha'],
                                     DS_PARAM[lang]['lm_beta'])

    desired_sample_rate = ds_model.sampleRate()

    webvtt = WebVTT()

    inference_start = timer()

    last_item = None
    sentences = []
    sentence = []
    metadata = None

    for start_trim in range(0, video_to_encode.duration, AUDIO_SPLIT_TIME):
        end_trim = video_to_encode.duration if start_trim + \
            AUDIO_SPLIT_TIME > video_to_encode.duration else (
                start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)

        duration = (AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH) if start_trim + \
            AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < video_to_encode.duration \
            else (video_to_encode.duration - start_trim)

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim, duration)

        audio = convert_samplerate(mp3file.path, desired_sample_rate,
                                   start_trim, duration)
        msg += '\nRunning inference.'

        metadata = ds_model.sttWithMetadata(audio)

        msg += '\nConfidence : %s' % metadata.confidence

        sentences[:] = []  # empty list
        sentence[:] = []  # empty list

        refItem = metadata.items[0]

        index = get_index(metadata, last_item, start_trim) if last_item else 0

        # nb of character in AUDIO_SPLIT_TIME
        msg += "METADATA ITEMS : %d " % len(metadata.items)

        sentences = get_sentences(metadata, refItem, index)

        last_item = (
            sentences[-1][-1].character,
            sentences[-1][-1].start_time) if len(sentences) > 0 else ()

        for sent in sentences:
            if len(sent) > 0:
                start_time = sent[0].start_time + start_trim
                end_time = sent[-1].start_time + start_trim
                str_sentence = ''.join(item.character for item in sent)
                # print(start_time, end_time, str_sentence)
                caption = Caption(
                    '%s.%s' % (
                        timedelta(seconds=int(str(start_time).split('.')[0])),
                        str('%.3f' % start_time).split('.')[1]),
                    '%s.%s' % (
                        timedelta(seconds=int(str(end_time).split('.')[0])),
                        str('%.3f' % end_time).split('.')[1]),
                    ['%s' % str_sentence])
                webvtt.captions.append(caption)

    # print(webvtt)
    msg += saveVTT(video_to_encode, webvtt)

    inference_end = timer() - inference_start
    msg += '\nInference took %0.3fs.' % inference_end
    # print(msg)
    return msg

def main_transcript(norm_mp3_file, duration, ds_model):
    msg = ""
    inference_start = timer()
    msg += "\nInference start %0.3fs." % inference_start

    desired_sample_rate = ds_model.sampleRate()
    webvtt = WebVTT()
    last_word_added = ""
    metadata = None
    all_text = ""
    for start_trim in range(0, duration, AUDIO_SPLIT_TIME):
        end_trim = (duration if start_trim + AUDIO_SPLIT_TIME > duration
                    else (start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH))

        dur = ((AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH)
               if start_trim + AUDIO_SPLIT_TIME + SENTENCE_MAX_LENGTH < duration
               else (duration - start_trim))

        msg += "\ntake audio from %s to %s - %s" % (start_trim, end_trim, dur)

        audio = convert_samplerate(norm_mp3_file, desired_sample_rate,
                                   start_trim, dur)
        msg += "\nRunning inference."

        metadata = ds_model.sttWithMetadata(audio)

        for transcript in metadata.transcripts:
            msg += "\nConfidence : %s" % transcript.confidence
            words = words_from_candidate_transcript(transcript)
            start_caption = start_trim + words[0]["start_time"]
            text_caption = []
            is_first_caption = True
            for word in words:
                all_text += word["word"] + " "
                # word : <class 'dict'> {'word': 'bonjour', 'start_time':
                # 0.58, 'duration': 7.34}
                text_caption.append(word["word"])
                if not (((word["start_time"] + start_trim) - start_caption)
                        < SENTENCE_MAX_LENGTH):
                    # create the caption
                    if is_first_caption:
                        # To review: merging of the new line with the
                        # previous one...
                        is_first_caption = False
                        text_caption = get_text_caption(
                            text_caption, last_word_added)

                    stop_caption = (start_trim + word["start_time"] +
                                    word["duration"])

                    # avoid overlapping captions
                    change_previous_end_caption(webvtt, start_caption)
                    caption = Caption(
                        format_time_caption(start_caption),
                        format_time_caption(stop_caption),
                        " ".join(text_caption),
                    )
                    webvtt.captions.append(caption)

                    # reset everything for the next sentence
                    start_caption = start_trim + word["start_time"]
                    text_caption = []

                last_word_added = word["word"]

            if start_trim + AUDIO_SPLIT_TIME > duration:
                # add the last sentence of the video here
                stop_caption = (start_trim + words[-1]["start_time"] +
                                words[-1]["duration"])
                caption = Caption(
                    format_time_caption(start_caption),
                    format_time_caption(stop_caption),
                    " ".join(text_caption),
                )
                webvtt.captions.append(caption)

    inference_end = timer() - inference_start
    msg += "\nInference took %0.3fs." % inference_end
    return msg, webvtt, all_text

def subtitle_generation(response, vtt, bin_size=3):
    """We define a bin of time period to display the words in sync with audio.

    Here, bin_size = 3 means each bin is of 3 secs. All the words in an
    interval of 3 secs in the result will be grouped together."""
    # response = "videoIntelligence-response-to-API"
    transcribed_text = ""
    index = 0
    flag = None

    for speech_transcription in response.annotation_results[
            0].speech_transcriptions:
        # The number of alternatives for each transcription is limited by
        # SpeechTranscriptionConfig.max_alternatives.
        # Each alternative is a different possible transcription
        # and has its own confidence score.
        for alternative in speech_transcription.alternatives:
            try:
                if alternative.words[0].start_time.seconds:
                    # bin start -> for first word of result
                    start_sec = alternative.words[0].start_time.seconds
                    start_microsec = alternative.words[0].start_time.nanos * 0.001
                else:
                    # bin start -> for first word of response
                    start_sec = 0
                    start_microsec = 0
                end_sec = start_sec + bin_size  # bin end sec

                # for last word of result
                last_word_end_sec = alternative.words[-1].end_time.seconds
                last_word_end_microsec = alternative.words[-1].end_time.nanos * 0.001

                # bin transcript
                transcript = alternative.words[0].word

                index += 1  # subtitle index

                for i in range(len(alternative.words) - 1):
                    try:
                        word = alternative.words[i + 1].word
                        word_start_sec = alternative.words[i + 1].start_time.seconds
                        word_start_microsec = alternative.words[
                            i + 1].start_time.nanos * 0.001
                        # 0.001 to convert nanos -> micros
                        word_end_sec = alternative.words[i + 1].end_time.seconds
                        word_end_microsec = alternative.words[
                            i + 1].end_time.nanos * 0.001

                        if word_end_sec < end_sec and not (
                                '!' in alternative.words[i].word
                                or '?' in alternative.words[i].word
                                or '.' in alternative.words[i].word):
                            transcript = transcript + " " + word
                        else:
                            previous_word_end_sec = alternative.words[
                                i].end_time.seconds
                            previous_word_end_microsec = alternative.words[
                                i].end_time.nanos * 0.001

                            # append bin transcript
                            start = str(datetime.timedelta(
                                0, start_sec, start_microsec))[:12]
                            end = str(datetime.timedelta(
                                0, previous_word_end_sec,
                                previous_word_end_microsec))[:12]
                            if len(start) <= 8:
                                start += ".000"
                            if len(end) <= 8:
                                end += ".000"
                            if flag and flag == start:
                                break
                            if not flag:
                                flag = start
                            caption = Caption(start, end, transcript)
                            transcribed_text += transcript + " "
                            vtt.captions.append(caption)

                            # reset bin parameters
                            start_sec = word_start_sec
                            start_microsec = word_start_microsec
                            end_sec = start_sec + bin_size
                            transcript = alternative.words[i + 1].word

                            index += 1
                    except IndexError:
                        pass

                # append the transcript of the last bin
                start = str(datetime.timedelta(0, start_sec, start_microsec))[:12]
                end = str(datetime.timedelta(
                    0, last_word_end_sec, last_word_end_microsec))[:12]
                if len(start) <= 8:
                    start += ".000"
                if len(end) <= 8:
                    end += ".000"
                if flag and flag == start:
                    break
                if not flag:
                    flag = start
                caption = Caption(start, end, transcript)
                vtt.captions.append(caption)
                index += 1
            except IndexError:
                pass

    # turn transcription list into subtitles
    return (transcribed_text, vtt)

# -*- coding: utf-8 -*-
import pysrt
import webvtt
from webvtt import WebVTT, Caption

subs = pysrt.open(
    'Tanmay Bakshi - New Google Employee Indian Boy Going To Ninth Grade.srt',
    encoding='utf-8')

vtt = WebVTT()

for ligne in subs:
    print(str(ligne.start))
    print(str(ligne.end))
    print(str(ligne.text))
    caption = Caption(str(ligne.start), str(ligne.end), str(ligne.text))
    # print(caption.start)
    # print(caption.end)
    # print(var2)
    vtt.captions.append(caption)

vtt.save('_fr.vtt')