def _index_ass(writer, object_id: str, events: Iterable[ass.document.Dialogue]):
    """Index ASS dialogue events into *writer*, one document per event.

    Args:
        writer: search-index writer exposing ``add_document`` (project type).
        object_id: identifier of the media object the events belong to.
        events: parsed ASS ``Dialogue`` events to index.  (Parameter renamed
            from the typo ``evevnts``; this helper is private, all visible
            call styles are positional.)
    """
    for idx, event in enumerate(events):
        writer.add_document(
            object_id=object_id,
            # Normalise native timedeltas to SRT timestamp strings so both
            # indexers (_index_ass / _index_srt) emit the same format.
            start=srt.timedelta_to_srt_timestamp(event.Start),
            end=srt.timedelta_to_srt_timestamp(event.End),
            content=event.Text,
            idx=idx,
        )
def _index_srt(writer, object_id: str, subtitles: Iterable[srt.Subtitle]):
    """Index parsed SRT subtitle cues into *writer*, one document per cue."""
    to_timestamp = srt.timedelta_to_srt_timestamp
    for cue in subtitles:
        writer.add_document(
            object_id=object_id,
            start=to_timestamp(cue.start),
            end=to_timestamp(cue.end),
            content=cue.content,
            # SRT cues carry their own index; reuse it rather than enumerating.
            idx=cue.index,
        )
def main():
    """Parse CLI arguments, search an .srt file for word(s), emit an EDL XML.

    Side effects: optionally writes the XML output file and, with ``--cut``,
    pickles the matched cues to ``./cut_list.pkl`` for the cutting step.
    """
    parser = argparse.ArgumentParser(
        prog="find_word",
        description="Searching for strings in a subtitle file and generating an edit decision list")
    parser.add_argument("-i", "--inputfile", help="input .srt file", required=True)
    parser.add_argument("-o", "--outputfile", help="output .xml file")
    parser.add_argument("-w", "--word", help="search for word(s)", required=True)
    parser.add_argument("-c", "--cut", action="store_true",
                        help="Automatically cutting the video file. (input video file, output video file)")
    parser.add_argument("-v", "--verbose", action='store_true', help="verbose mode")
    args = parser.parse_args()

    # Verbose mode
    if args.verbose:
        print('Reading subtitle .srt file', CRED + args.inputfile + CEND)
        if args.outputfile is not None:
            print('Output .XML file is', CRED + args.outputfile + CEND)
        print('Search word(s) is/are', CRED + args.word + CEND)
        # BUG FIX: a store_true flag is always a bool, never None, so the old
        # `is not None` test always fired and printed `False` noise.
        if args.cut:
            print(args.cut)

    # Context manager closes the input file (the original leaked the handle).
    with open(args.inputfile, "r") as subtitle:
        data = list(srt.parse(subtitle))

    rows = []
    xml = open(args.outputfile, "w") if args.outputfile is not None else None
    try:
        for sub in data:
            if args.word in sub.content:
                start = srt.timedelta_to_srt_timestamp(sub.start)
                end = srt.timedelta_to_srt_timestamp(sub.end)
                # Verbose mode
                if args.verbose:
                    print(start, end)
                if xml is not None:
                    # EDL/XML wants '.' as decimal separator, SRT uses ','.
                    # NOTE: as in the original, the converted form is also
                    # what ends up in cut_list when an output file is given.
                    start = re.sub(',', '.', start)
                    end = re.sub(',', '.', end)
                    xml.write('''<entry producer="producer0" in="%s" out="%s" />\n''' % (start, end))
                rows.append({'start': start, 'end': end, 'content': sub.content})
    finally:
        # Close the XML handle even on error (the original never closed it).
        if xml is not None:
            xml.close()

    # Build the frame in one shot: DataFrame.append() was removed in
    # pandas 2.x and was quadratic when called per-row anyway.
    cut_list = pandas.DataFrame(rows, columns=['start', 'end', 'content'])
    if args.cut:
        cut_list.to_pickle("./cut_list.pkl")
def srt_to_po_converter(src_fp, dest_fp):
    """Convert an SRT byte stream into gettext PO entries on *dest_fp*.

    Each cue becomes one POEntry: the msgid is the cue text with newlines
    replaced by NEW_LINE_TAG, and the comment records the cue index plus its
    "start --> end" timing line.
    """
    for cue in srt.parse(src_fp.read().decode("utf-8-sig")):
        msgid = NEW_LINE_TAG.join(cue.content.splitlines())
        timing = "{} --> {}".format(
            srt.timedelta_to_srt_timestamp(cue.start),
            srt.timedelta_to_srt_timestamp(cue.end),
        )
        entry = polib.POEntry(msgid=msgid, comment=f"{cue.index}\n{timing}")
        dest_fp.write(f"{entry}\n".encode("utf-8"))
def test_parser_noncontiguous_ignore_errors(subs, fake_idx, garbage, fake_timedelta):
    """Garbage between blocks is skipped when ignore_errors=True."""
    timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    fake_block = "\n\n%d\n%s %s" % (fake_idx, timestamp, garbage)
    noisy = srt.compose(subs).replace("\n\n", fake_block)
    # Should not raise, we have ignore_errors
    list(srt.parse(noisy, ignore_errors=True))
def get_subtitles_from_file(filename, index, url=None, name=None):
    """Load an SRT file from data/subtitles/ and return its cues as JSON.

    Args:
        filename: name of the .srt file under ``data/subtitles/``.
        index, url, name: forwarded to ``get_json_response`` (project helper).
    """
    # BUG FIX: interpolate the `filename` parameter — it was previously
    # unused and the open() path was effectively hard-coded.
    with open(f'data/subtitles/{filename}') as f:
        text = f.read()
    # (No explicit f.close(): the with-block already closed the file.)
    transcriptions = [
        {
            'index': i + 1,
            'start': srt.timedelta_to_srt_timestamp(sub.start),
            'end': srt.timedelta_to_srt_timestamp(sub.end),
            'content': sub.content,
        }
        for i, sub in enumerate(srt.parse(text))
    ]
    return get_json_response(transcriptions, index, url, name)
def test_parser_noncontiguous(subs, fake_idx, garbage, fake_timedelta):
    """Garbage that only resembles an SRT block must still fail parsing."""
    # Put some garbage between subs that should trigger our failed parsing
    # detection. Since we do some magic to try and detect blank lines that
    # don't really delimit subtitles, it has to look at least a little like
    # an SRT block.
    fake_block = "\n\n%d\n%s %s" % (
        fake_idx, srt.timedelta_to_srt_timestamp(fake_timedelta), garbage)
    broken = srt.compose(subs).replace("\n\n", fake_block)
    with pytest.raises(srt.SRTParseError):
        list(srt.parse(broken))
def test_parser_noncontiguous(subs, fake_idx, garbage, fake_timedelta):
    """Garbage that only resembles an SRT block must still fail parsing."""
    # Put some garbage between subs that should trigger our failed parsing
    # detection. Since we do some magic to try and detect blank lines that
    # don't really delimit subtitles, it has to look at least a little like
    # an SRT block.
    fake_block = "\n\n%d\n%s %s" % (
        fake_idx, srt.timedelta_to_srt_timestamp(fake_timedelta), garbage)
    broken = srt.compose(subs).replace("\n\n", fake_block)
    with assert_raises(srt.SRTParseError):
        list(srt.parse(broken))
def test_parser_didnt_match_to_end_raises(subs, fake_idx, garbage, fake_timedelta):
    """Unparseable trailing content is reported with exact byte offsets."""
    fake_ts = srt.timedelta_to_srt_timestamp(fake_timedelta)
    garbage = "\n\n%d\n%s %s" % (fake_idx, fake_ts, garbage)
    composed = "".join([sub.to_srt() for sub in subs] + [garbage])
    with assert_raises(srt.SRTParseError) as thrown_exc:
        list(srt.parse(composed))
    # Since we will consume as many \n as needed until we meet the lookahead
    # assertion, leading newlines in `garbage` will be stripped.
    stripped = garbage.lstrip("\n")
    eq(stripped, thrown_exc.exception.unmatched_content)
    eq(len(composed) - len(stripped), thrown_exc.exception.expected_start)
    eq(len(composed), thrown_exc.exception.actual_start)
# %%
import srt

# %%
# Raw string avoids the invalid '\s' escape in a Windows-style path
# (a SyntaxWarning today, a SyntaxError in future CPython).
with open(r'video\sub_rus2.srt', mode="r", encoding="utf-8") as input_file:
    res = list(
        srt.parse("\n".join([x.rstrip() for x in input_file.readlines()])))

# %%
res[0]

# %%
srt.compose(res)

# %%
srt.timedelta_to_srt_timestamp(res[0].start)

# %%
import re

res[0].content

# %%
s = "\nasd \tasd\nasd \t33 s\n\n asd"
# NOTE(review): the '|r' alternative also deletes every literal 'r' character,
# not just newlines — confirm that is intended; r"\r?\n" alone would strip
# only line breaks.
s = re.sub(r"\r?\n|r", '', s).replace("\t", '')
print(s)

# %%
# Removed `print(a)`: `a` was never defined anywhere in this script and the
# cell raised NameError when executed.
# %%
def test_srt_timestamp_to_timedelta_too_short_raises(ts):
    """A timestamp missing its final digit must be rejected."""
    truncated = srt.timedelta_to_srt_timestamp(ts)[:-1]
    with assert_raises(ValueError):
        srt.srt_timestamp_to_timedelta(truncated)
def test_timedelta_to_srt_timestamp_can_go_over_24_hours(days):
    """Hours are not wrapped modulo 24 when composing timestamps."""
    stamp = srt.timedelta_to_srt_timestamp(timedelta(days=days))
    hour_field, _sep, _rest = stamp.partition(':')
    eq(int(hour_field), days * HOURS_IN_DAY)
def composeSub(self, sub):
    """Render a single subtitle cue as an SRT block string."""
    to_ts = srt.timedelta_to_srt_timestamp
    # index line, timing line, content, then the blank line that ends a block
    return f"{sub.index}\n{to_ts(sub.start)} --> {to_ts(sub.end)}\n{sub.content}\n\n"
def srt_to_frame(fps, st):
    """Convert a subtitle timedelta *st* to a frame number at *fps*.

    The SRT timestamp (comma decimal separator) is rewritten with a dot so
    Timecode accepts it, then two fixed frame offsets are subtracted.
    """
    stamp = srt.timedelta_to_srt_timestamp(st).replace(",", ".")
    # NOTE(review): 86400 and 21600 look like frame offsets baked in for a
    # specific source timecode origin — confirm against the footage used.
    return Timecode(fps, stamp).frame_number - 86400 - 21600
def get_transcriptions(response, bin_size=3):
    """Bucket speech-recognition words into ~bin_size-second subtitle dicts.

    Walks `response.results` (presumably a Google Cloud Speech response —
    words carry `start_time`/`end_time` with `.seconds`/`.nanos` — TODO
    confirm against the caller) and groups consecutive words whose end time
    falls inside the current bin. Each completed bin is emitted as a dict
    with 'index', SRT-formatted 'start'/'end' strings, and the joined
    'content' text.

    Args:
        response: recognition response exposing `.results[*].alternatives`.
        bin_size: target bin length in whole seconds.

    Returns:
        list of {'index', 'start', 'end', 'content'} dicts.
    """
    transcriptions = []
    index = 0
    for result in response.results:
        try:
            # Results with no word timing info raise IndexError below and
            # are skipped by the outer except.
            if result.alternatives[0].words[0].start_time.seconds:
                # bin start -> for first word of result
                start_sec = result.alternatives[0].words[0].start_time.seconds
                start_microsec = result.alternatives[0].words[
                    0].start_time.nanos * 0.001
            else:
                # bin start -> For First word of response
                start_sec = 0
                start_microsec = 0
            end_sec = start_sec + bin_size  # bin end sec
            # for last word of result
            last_word_end_sec = result.alternatives[0].words[
                -1].end_time.seconds
            last_word_end_microsec = result.alternatives[0].words[
                -1].end_time.nanos * 0.001
            # bin transcript
            transcript = result.alternatives[0].words[0].word
            index += 1  # subtitle index
            for i in range(len(result.alternatives[0].words) - 1):
                try:
                    word = result.alternatives[0].words[i + 1].word
                    word_start_sec = result.alternatives[0].words[
                        i + 1].start_time.seconds
                    word_start_microsec = result.alternatives[0].words[
                        i + 1].start_time.nanos * 0.001  # 0.001 to convert nana -> micro
                    word_end_sec = result.alternatives[0].words[
                        i + 1].end_time.seconds
                    word_end_microsec = result.alternatives[0].words[
                        i + 1].end_time.nanos * 0.001
                    if word_end_sec < end_sec:
                        # Word still fits in the current bin: accumulate text.
                        transcript = transcript + " " + word
                    else:
                        # Bin boundary crossed: close the bin at the end of
                        # the *previous* word, then start a new bin at this word.
                        previous_word_end_sec = result.alternatives[0].words[
                            i].end_time.seconds
                        previous_word_end_microsec = result.alternatives[
                            0].words[i].end_time.nanos * 0.001
                        # append bin transcript
                        # transcriptions.append(srt.Subtitle(index, datetime.timedelta(0, start_sec, start_microsec), datetime.timedelta(0, previous_word_end_sec, previous_word_end_microsec), transcript))
                        transcriptions.append({
                            'index': index,
                            'start': srt.timedelta_to_srt_timestamp(
                                datetime.timedelta(0, start_sec, start_microsec)),
                            'end': srt.timedelta_to_srt_timestamp(
                                datetime.timedelta(
                                    0, previous_word_end_sec,
                                    previous_word_end_microsec)),
                            'content': transcript
                        })
                        # reset bin parameters
                        start_sec = word_start_sec
                        start_microsec = word_start_microsec
                        end_sec = start_sec + bin_size
                        transcript = result.alternatives[0].words[i + 1].word
                        index += 1
                except IndexError:
                    # Malformed word entry: skip it and keep the current bin.
                    pass
            # append transcript of last transcript in bin
            # transcriptions.append(srt.Subtitle(index, datetime.timedelta(0, start_sec, start_microsec), datetime.timedelta(0, last_word_end_sec, last_word_end_microsec), transcript))
            transcriptions.append({
                'index': index,
                'start': srt.timedelta_to_srt_timestamp(
                    datetime.timedelta(0, start_sec, start_microsec)),
                'end': srt.timedelta_to_srt_timestamp(
                    datetime.timedelta(0, last_word_end_sec,
                                       last_word_end_microsec)),
                'content': transcript
            })
            index += 1
        except IndexError:
            # Result had no alternatives/words at all: ignore it.
            pass
    return transcriptions
def test_timedelta_to_srt_timestamp_can_go_over_24_hours(days):
    """Hours must exceed 24 rather than wrap when the delta spans days."""
    stamp = srt.timedelta_to_srt_timestamp(timedelta(days=days))
    hour_field = stamp.split(":")[0]
    assert int(hour_field) == days * HOURS_IN_DAY
def test_bad_timestamp_format_raises(ts):
    """Corrupting the first ':' separator must trigger TimestampParseError."""
    corrupted = srt.timedelta_to_srt_timestamp(ts).replace(":", "t", 1)
    with pytest.raises(srt.TimestampParseError):
        srt.srt_timestamp_to_timedelta(corrupted)
captionCC = list(cp)

# Collect indices of cues that open with either trigger word.
# startswith accepts a tuple, replacing the two-way `or`.
CCPointList = []
for i, cue in enumerate(captionCC):
    if cue.content.startswith(("what", "george")):
        CCPointList.append(i)
        print(cue.content)
        print(cue.start)
print(CCPointList)

filename = "Podcast1.mp4"
# Hoisted out of the loop: the original recomputed CCPointList[-1] each
# iteration and also tested `index >= 0`, which is vacuously true.
last_point = CCPointList[-1] if CCPointList else None
for n, point in enumerate(CCPointList, start=1):
    starttime = srt.timedelta_to_srt_timestamp(captionCC[point].start)
    if point != last_point:
        # Clip runs up to the start of the next trigger cue
        # (CCPointList[n] is the element after this one, since n = j + 1).
        endtime = srt.timedelta_to_srt_timestamp(captionCC[CCPointList[n]].start)
    else:
        # Final clip runs to the very end of the captions.
        endtime = srt.timedelta_to_srt_timestamp(captionCC[-1].end)
    clip = VideoFileClip(filename).subclip(starttime, endtime)
    clip.write_videofile(f"clip{n}.mp4")
    clip.close()