def vttToWebCaptions(operator_object, vttObject):
    webcaptions = []

    # Fetch the VTT file from S3
    s3 = boto3.client('s3')
    try:
        print("Getting data from s3://" + vttObject["Bucket"] + "/" + vttObject["Key"])
        data = s3.get_object(Bucket=vttObject["Bucket"], Key=vttObject["Key"])
        vtt = data['Body'].read().decode('utf-8')
    except Exception as e:
        operator_object.update_workflow_status("Error")
        operator_object.add_workflow_metadata(
            WebCaptionsError="Unable to read VTT file. " + str(e))
        raise MasExecutionError(operator_object.return_output_object())

    # Parse the VTT payload from an in-memory buffer and convert each cue
    # into a plain dict with start/end times in seconds
    buffer = StringIO(vtt)
    for caption in webvtt.read_buffer(buffer):
        webcaption = {}
        webcaption["start"] = formatTimeVTTtoSeconds(caption.start)
        webcaption["end"] = formatTimeVTTtoSeconds(caption.end)
        webcaption["caption"] = caption.text
        webcaptions.append(webcaption)
    return webcaptions
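# The snippets that call formatTimeVTTtoSeconds() do not show its definition.
# A minimal sketch, assuming webvtt-py's normalized "HH:MM:SS.mmm" timestamp
# strings (the original helper may differ):
def formatTimeVTTtoSeconds(vtt_timestamp):
    # "00:01:02.500" -> 62.5
    hours, minutes, seconds = vtt_timestamp.split(':')
    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)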
def parse_webvtt_subtitles_to_text(subtitle_data):
    """
    Return values:
    subtitles, as a text string
    retryable_error, boolean: if we should discard this and try again later
        (e.g. a weird network error or rate-limiting)
    non_retryable_error, boolean: if we shouldn't retry, e.g. because there
        were no subtitles
    """
    if subtitle_data and SUBTITLE_RATE_LIMIT_STRING in subtitle_data:
        log.info("subtitle_data {}".format(subtitle_data))
        return None, True, False  # if we're rate-limited, it's a retryable error
    elif subtitle_data:
        subtitle_lines = [
            caption.text
            for caption in webvtt.read_buffer(StringIO(subtitle_data))
            if caption.text.strip() != ''
        ]
        if not subtitle_lines:
            # every caption was blank: treat it like missing subtitles
            return None, False, True
        # Drop a line when the previous line is contained in it
        # (consecutive repeats and roll-up captions)
        subtitle_lines_deduped = [subtitle_lines[0]]
        for line_a, line_b in zip(subtitle_lines[:-1], subtitle_lines[1:]):
            if line_a not in line_b:
                subtitle_lines_deduped.append(line_b)
        subs = '\n'.join(subtitle_lines_deduped)
        return subs, False, False
    else:
        subs = None
        return subs, False, True  # if there's no subtitle data, it's a non-retryable error
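# A self-contained illustration of the dedup step above (isolated from the
# full function, which also needs SUBTITLE_RATE_LIMIT_STRING and a logger
# from its module):
from io import StringIO
import webvtt

demo_vtt = (
    "WEBVTT\n\n"
    "00:00:00.000 --> 00:00:01.000\nHello\n\n"
    "00:00:01.000 --> 00:00:02.000\nHello\n\n"
    "00:00:02.000 --> 00:00:03.000\nGoodbye\n"
)
lines = [c.text for c in webvtt.read_buffer(StringIO(demo_vtt)) if c.text.strip()]
deduped = [lines[0]]
for a, b in zip(lines[:-1], lines[1:]):
    if a not in b:
        deduped.append(b)
print('\n'.join(deduped))  # "Hello\nGoodbye": the consecutive repeat is dropped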
def _get_sentences(self, file_uri: str) -> List[Dict[str, Union[str, float]]]:
    # Create file-like object of the caption file's content
    buffer = io.StringIO(self._request_caption_content(file_uri))

    # Get the list of caption blocks
    captions = webvtt.read_buffer(buffer).captions
    buffer.close()

    # Create timestamped sentences
    sentences = []

    # List of text fragments making up the current sentence
    lines = []
    start_time = 0
    for caption in captions:
        start_time = start_time or caption.start_in_seconds
        lines.append(caption.text)
        end_sentence_search = re.search(self.end_of_sentence_pattern, caption.text)

        # Caption block is an end-of-sentence block
        if end_sentence_search:
            sentence = {
                'start_time': start_time,
                'end_time': caption.end_in_seconds,
                'text': ' '.join(lines),
            }
            sentences.append(sentence)

            # Reset lines and start_time for the start of a new sentence
            lines = []
            start_time = 0

    # If there are any leftover lines, add a final sentence for them
    if lines:
        sentences.append({
            'start_time': start_time,
            'end_time': captions[-1].end_in_seconds,
            'text': ' '.join(lines),
        })

    return sentences
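# self.end_of_sentence_pattern is not shown above. A plausible definition
# (an assumption, not the original) matches terminal punctuation at the end
# of a caption block:
#
#     end_of_sentence_pattern = r'[.!?]["\')\]]*\s*$'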
def vttToCaptions(self, vttObject):
    captions = []
    vtt = ""

    # Fetch the VTT file from S3
    s3 = boto3.client('s3')
    try:
        self.logger.debug("Getting data from s3://" + vttObject["Bucket"] + "/" + vttObject["Key"])
        vtt = S3Helper().readFromS3(vttObject["Bucket"], vttObject["Key"])
        self.logger.debug(vtt)
    except Exception as e:
        # FIXME: surface the failure instead of parsing an empty string below
        self.logger.error(e)

    buffer = StringIO(vtt)
    for vttcaption in webvtt.read_buffer(buffer):
        caption = {}
        caption["start"] = self.formatTimeVTTtoSeconds(vttcaption.start)
        caption["end"] = self.formatTimeVTTtoSeconds(vttcaption.end)
        caption["caption"] = vttcaption.text
        captions.append(caption)
    return captions
def test_read_memory_buffer(self):
    payload = ''
    with open(self._get_file('sample.vtt'), 'r', encoding='utf-8') as f:
        payload = f.read()

    buffer = io.StringIO(payload)

    vtt = webvtt.read_buffer(buffer)
    self.assertIsInstance(vtt.captions, list)
def _parse_subs(self, subs: str) -> List[str]:
    buffer = StringIO(subs)
    lines = []
    for caption in webvtt.read_buffer(buffer):
        try:
            lines.append(caption.text)
        except Exception:
            # Skip captions whose text cannot be read
            pass
    return lines
def _get_captions(
    self, closed_caption_content: str
) -> List[webvtt.structures.Caption]:
    # Create file-like object of the caption file's content
    buffer = io.StringIO(closed_caption_content)

    # Get the list of caption blocks
    captions = webvtt.read_buffer(buffer).captions
    buffer.close()
    return captions
def scrape_3c_media(url):
    transcript_log = []
    with get_session() as ses:
        title, config = get_w3_info(ses.get(url))
        for video in config['playlist']:
            for track in video['tracks']:
                with StringIO(ses.get(track['file']).text) as captions:
                    for caption in webvtt.read_buffer(captions):
                        transcript_log.append((caption.start, caption.text))
    return (title, transcript_log)
def translateVTT(subid: ObjectId, language: str, translator: str):
    sub_obj = db.subtitles.find_one({'_id': subid})
    if sub_obj is None:
        raise UserError('ITEM_NOT_FOUND')
    if sub_obj['format'] != 'vtt':
        raise UserError('ONLY_VTT_SUPPORTED')
    with redis_lock.Lock(rdb, "subtitleEdit:" + str(subid)), MongoTransaction(client) as s:
        cache = db.subtitle_translation_cache.find_one(
            {"subid": subid, "lang": language, "translator": translator},
            session=s())
        if cache is None or cache['version'] < sub_obj['meta']['modified_at']:
            # cache miss
            vtt = webvtt.read_buffer(io.StringIO(sub_obj['content']))
            if translator == 'googletrans':
                result = translate_google(vtt, language)
            elif translator == 'baidutrans':
                with redis_lock.Lock(rdb, "lock-baidutrans"):
                    result = translate_baidu(vtt, language)
            else:
                raise UserError('UNSUPPORTED_TRANSLATOR')
            if cache is None:
                db.subtitle_translation_cache.insert_one(
                    {
                        'subid': subid,
                        'translator': translator,
                        'lang': language,
                        'version': sub_obj['meta']['modified_at'],
                        'content': result
                    },
                    session=s())
            else:
                db.subtitle_translation_cache.update_one(
                    {'_id': cache['_id']},
                    {
                        '$set': {
                            'version': sub_obj['meta']['modified_at'],
                            'content': result
                        }
                    },
                    session=s())
            s.mark_succeed()
            return result
        else:
            # cache hit
            return cache['content']
def translate_captions_file(inbuf, outbuf, method='inplace'):
    '''Translates captions from input buffer to output buffer'''
    captions = webvtt.read_buffer(inbuf)

    # Preprocess
    encode_names(captions)
    fix_hyphenation(captions)

    # Main
    translate_texts(captions, method)

    # Postprocess
    encode_names(captions, back=True)
    revert_hyphenation(captions)

    captions.write(outbuf)
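# A hypothetical driver for translate_captions_file() with in-memory output
# (encode_names, fix_hyphenation, translate_texts and revert_hyphenation are
# assumed to come from the same module; 'input.vtt' is a placeholder path):
from io import StringIO

with open('input.vtt', 'r', encoding='utf-8') as inbuf:
    outbuf = StringIO()
    translate_captions_file(inbuf, outbuf)
print(outbuf.getvalue())  # the translated WebVTT document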
def test_read_memory_buffer_carriage_return(self):
    """https://github.com/glut23/webvtt-py/issues/29"""
    buffer = io.StringIO(textwrap.dedent('''\
        WEBVTT\r
        \r
        00:00:00.500 --> 00:00:07.000\r
        Caption text #1\r
        \r
        00:00:07.000 --> 00:00:11.890\r
        Caption text #2\r
        \r
        00:00:11.890 --> 00:00:16.320\r
        Caption text #3\r
        '''))

    vtt = webvtt.read_buffer(buffer)
    self.assertEqual(len(vtt.captions), 3)
def infer_vtt_indic_en():
    start_time = time.time()
    model, source_lang, target_lang = get_inference_params()
    source_text = request.form['text']
    # vad_segments = request.form['vad_nochunk']
    # Assuming it is an array of start & end timestamps

    vad = webvtt.read_buffer(StringIO(source_text))
    source_sentences = [
        v.text.replace('\r', '').replace('\n', ' ') for v in vad
    ]

    ## SUMANTH LOGIC HERE ##

    # for each vad timestamp, do:
    large_sentence = ' '.join(source_sentences)  # only sentences in that time range
    large_sentence = large_sentence.lower()
    # split_sents = sentence_split(large_sentence, 'en')
    # print(split_sents)
    large_sentence = re.sub(r'[^\w\s]', '', large_sentence)
    punctuated = rpunct.punctuate(large_sentence, batch_size=32)
    end_time = time.time()
    print("Time Taken for punctuation: {} s".format(end_time - start_time))

    start_time = time.time()
    split_sents = splitter([punctuated])  ### Please uncomment
    # print(split_sents)
    # output_sentence_punctuated = model.translate_paragraph(punctuated, source_lang, target_lang)
    output_sents = model.batch_translate(split_sents, source_lang, target_lang)
    # print(output_sents)
    # output_sents = split_sents
    # print(output_sents)

    # Align the translations back to the time ranges of source_sentences:
    # distribute translated words across captions in proportion to each
    # caption's share of the punctuated source text
    map_ = {split_sents[i]: output_sents[i] for i in range(len(split_sents))}
    # print(map_)
    punct_para = ' '.join(list(map_.keys()))
    nmt_para = ' '.join(list(map_.values()))
    nmt_words = nmt_para.split(' ')

    len_punct = len(punct_para.split(' '))
    len_nmt = len(nmt_para.split(' '))

    start = 0
    for i in range(len(vad)):
        if vad[i].text == '':
            continue
        len_caption = len(vad[i].text.split(' '))
        frac = (len_caption / len_punct)
        # frac = round(frac, 2)
        req_nmt_size = floor(frac * len_nmt)
        # print(frac, req_nmt_size)
        vad[i].text = ' '.join(nmt_words[start:start + req_nmt_size])
        # print(vad[i].text)
        # print(start, req_nmt_size)
        start += req_nmt_size

    end_time = time.time()
    print("Time Taken for translation: {} s".format(end_time - start_time))
    # vad.save('aligned.vtt')
    return {
        'text': vad.content,
        # 'duration': round(end_time - start_time, 2)
    }
def test_read_malformed_buffer(self):
    malformed_payloads = ['', 'MOCK MALFORMED CONTENT']
    for payload in malformed_payloads:
        buffer = io.StringIO(payload)
        with self.assertRaises(MalformedFileError):
            webvtt.read_buffer(buffer)
def test_read_file_buffer(self):
    with open(self._get_file('sample.vtt'), 'r', encoding='utf-8') as f:
        vtt = webvtt.read_buffer(f)
        self.assertIsInstance(vtt.captions, list)
def print_text_from_vtt(inbuf):
    captions = webvtt.read_buffer(inbuf)
    text = '\n'.join(c.text for c in captions)
    # Rejoin words that were hyphen-split across caption boundaries
    text = re.sub('-\n-', '', text)
    print(text)
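# Example use of the helper above with an in-memory buffer; the payload is
# illustrative and shows the '-\n-' cleanup rejoining a hyphen-split word:
from io import StringIO

sample = (
    "WEBVTT\n\n"
    "00:00:00.000 --> 00:00:02.000\nHyphen-\n\n"
    "00:00:02.000 --> 00:00:04.000\n-ated word\n"
)
print_text_from_vtt(StringIO(sample))  # prints "Hyphenated word"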