def test_sort_and_reindex_not_in_place_matches(input_subs, start_index):
    """Check that copying and in-place sort_and_reindex agree on their output.

    Both modes must yield equal subtitles. The copying mode must return
    objects that are not the ones passed in, while the in-place mode must
    return exactly the objects it was given.
    """
    # Independent copies per call, so that one call cannot affect the other
    copying_inputs = [srt.Subtitle(**vars(sub)) for sub in input_subs]
    mutating_inputs = [srt.Subtitle(**vars(sub)) for sub in input_subs]

    copying_input_ids = [id(sub) for sub in copying_inputs]
    mutating_input_ids = [id(sub) for sub in mutating_inputs]

    copying_result = list(
        srt.sort_and_reindex(copying_inputs, start_index=start_index)
    )
    mutating_result = list(
        srt.sort_and_reindex(
            mutating_inputs, start_index=start_index, in_place=True
        )
    )

    # Same subtitles regardless of mode
    subs_eq(copying_result, mutating_result)

    # Copying mode: none of the returned objects is one of the inputs
    assert_false(any(id(sub) in copying_input_ids for sub in copying_result))

    # In-place mode: every returned object is one of the inputs
    assert_true(all(id(sub) in mutating_input_ids for sub in mutating_result))
def test_sort_and_reindex_not_in_place_matches(input_subs, start_index):
    """Compare sort_and_reindex with and without ``in_place=True``.

    The two outputs must be equal; only the in-place call may return the
    very objects it received.
    """

    def clone_all(subs):
        # Rebuild each subtitle from its attributes to get a distinct object
        return [srt.Subtitle(**vars(sub)) for sub in subs]

    # Separate copies for the two calls so they can't affect each other
    fresh_for_copy_mode = clone_all(input_subs)
    fresh_for_in_place_mode = clone_all(input_subs)

    copy_mode_ids = [id(sub) for sub in fresh_for_copy_mode]
    in_place_mode_ids = [id(sub) for sub in fresh_for_in_place_mode]

    copy_mode_output = list(
        srt.sort_and_reindex(fresh_for_copy_mode, start_index=start_index),
    )
    in_place_mode_output = list(
        srt.sort_and_reindex(
            fresh_for_in_place_mode, start_index=start_index, in_place=True
        ),
    )

    # The results in each case should be the same
    subs_eq(copy_mode_output, in_place_mode_output)

    # Without in_place, new subtitle objects should have been created
    assert_false(any(id(sub) in copy_mode_ids for sub in copy_mode_output))

    # With in_place, the same subtitle objects should be reused
    assert_true(
        all(id(sub) in in_place_mode_ids for sub in in_place_mode_output)
    )
def __init__(
        self,
        id="",
        f=None,  # one or many (list) file
        comment='##',
        set_id_as_prog=True,
        debug=False):
    """Load one or several subtitle files and parse them with ``srt``.

    Args:
        id: Identifier stored on the instance as ``self.id``.
        f: A single path (str) or a list of paths. Files are read in
            sorted path order and their lines concatenated.
        comment: Lines starting with this prefix are dropped before parsing.
        set_id_as_prog: When true, the parsed subtitles are passed through
            ``srt.sort_and_reindex``.
        debug: When true, the retained raw lines are kept as ``self.raw``.

    Raises:
        ValueError: If ``f`` is neither a str nor a list.
    """
    if isinstance(f, str):
        paths = [f]
    elif isinstance(f, list):
        paths = f
    else:
        raise ValueError("f must be a str path or a list of path")

    # id of the srt
    self.id = id

    # Raw content: every non-comment line of every file, in sorted file order
    kept_lines = []
    for path in sorted(paths):
        with open(path) as handle:
            kept_lines.extend(
                line for line in handle if not line.startswith(comment)
            )

    if debug:
        self.raw = kept_lines

    # Parsing
    parsed = list(srt.parse("".join(kept_lines)))
    if set_id_as_prog:
        parsed = list(srt.sort_and_reindex(parsed))
    self.subs = parsed
def merge(file_output, file_inputs):
    """Merge several subtitle files into ``file_output``.

    Every path in ``file_inputs`` is loaded with ``read_srt``; the loaded
    lists are folded left to right with ``merge_two_srt_lists``, and the
    result is passed through ``srt.sort_and_reindex`` before being handed to
    ``write_srt``.

    Raises:
        IndexError: If ``file_inputs`` is empty.
    """
    print('begin merging {} into {}'.format(file_inputs, file_output))

    loaded = [read_srt(path) for path in file_inputs]

    # Start from the first file and fold each remaining one into it
    merged = loaded[0]
    for following in loaded[1:]:
        merged = merge_two_srt_lists(merged, following)

    write_srt(file_output, srt.sort_and_reindex(merged))
    print('end merging {} into {}'.format(file_inputs, file_output))
def test_sort_and_reindex_same_start_time_uses_end(input_subs):
    """Subtitles that share a start time must come out ordered by end time."""
    # Give every sub the same start so that only the end time can decide order
    shared_start = timedelta(1)
    for sub in input_subs:
        sub.start = shared_start

    in_place_result = srt.sort_and_reindex(input_subs, in_place=True)
    reindexed_subs = list(in_place_result)

    # Computed after reindexing: the in-place call returns the input objects
    ordered_by_end = sorted(input_subs, key=lambda sub: sub.end)
    eq(reindexed_subs, ordered_by_end)
def test_subs_starts_before_zero_removed(positive_subs, negative_subs, negative_td):
    """Subtitles moved to ``negative_td`` must be dropped by sort_and_reindex."""
    for sub in negative_subs:
        # End is set too, just to avoid tripping any start >= end errors
        sub.start = sub.end = negative_td

    mixed = positive_subs + negative_subs
    surviving = list(srt.sort_and_reindex(mixed, in_place=True))

    # Only the positive subs should be left
    subs_eq(surviving, positive_subs, any_order=True)
def test_sort_and_reindex_same_start_time_uses_end(input_subs):
    """With identical start times, sorting must fall back to the end time."""
    pinned_start = timedelta(1)
    for sub in input_subs:
        # Same start everywhere so that end time is compared only
        sub.start = pinned_start

    reindexed_subs = [
        sub for sub in srt.sort_and_reindex(input_subs, in_place=True)
    ]

    # Expected order is built afterwards, from the same (reindexed) objects
    expected_sorting = sorted(input_subs, key=lambda sub: sub.end)
    assert reindexed_subs == expected_sorting
def test_sort_and_reindex_no_skip(input_subs):
    """With ``skip=False`` no subtitle may be dropped, even with swapped times."""
    # Swap start and end on every sub; this must not trigger a skip
    for sub in input_subs:
        sub.start, sub.end = sub.end, sub.start

    kept = list(srt.sort_and_reindex(input_subs, skip=False))

    # Nothing should have been skipped
    assert len(kept) == len(input_subs)
def main(argv):
    """Merge a primary and a secondary subtitle file from the current directory.

    The first ``./*.<primary_language>.srt`` and ``./*.<secondary_language>.srt``
    files found by glob are parsed. Primary subtitles are wrapped in a yellow
    ``<font>`` tag, secondary subtitles are prefixed with ``{\\an8}``, and the
    combined, reindexed result is written next to the primary file with the
    ``.<primary_language>.srt`` suffix replaced by ``.merged.srt``.

    Args:
        argv: Command-line arguments without the program name. Supports
            ``-p/--primary-language``, ``-s/--secondary-language`` and
            ``-h/--help``.

    Raises:
        SystemExit: Code 2 on bad options, code 1 when an input file cannot be
            found, and a normal exit after printing help.
    """
    usage = 'merge_subtitles.py -p <primary_language> -s <secondary_language>'

    # Parse arguments
    primary_language = ''
    secondary_language = ''
    try:
        # "help" is registered so that the --help branch below is reachable
        opts, _unused_args = getopt.getopt(
            argv, "hp:s:",
            ["help", "primary-language=", "secondary-language="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage)
            sys.exit()
        elif opt in ("-p", "--primary-language"):
            primary_language = arg
        elif opt in ("-s", "--secondary-language"):
            secondary_language = arg

    # Locate the input files; fail with a message instead of a bare IndexError
    paths = []
    for language in (primary_language, secondary_language):
        pattern = './*.' + language + '.srt'
        matches = glob.glob(pattern)
        if not matches:
            print('No subtitle file matching ' + pattern)
            sys.exit(1)
        paths.append(matches[0])
    primary_path, secondary_path = paths

    # Read files and convert to list
    with open(primary_path, 'r', errors='ignore') as primary_file:
        primary_text = primary_file.read()
    with open(secondary_path, 'r', errors='ignore') as secondary_file:
        secondary_text = secondary_file.read()
    subtitles_primary = list(srt.parse(primary_text))
    subtitles_secondary = list(srt.parse(secondary_text))

    # Make primary yellow
    for s in subtitles_primary:
        s.content = '<font color="#ffff54">' + s.content + '</font>'

    # Place secondary on top
    for s in subtitles_secondary:
        s.content = '{\\an8}' + s.content

    # Merge
    subtitles_merged = subtitles_primary + subtitles_secondary
    subtitles_merged = list(srt.sort_and_reindex(subtitles_merged))

    # Write merged to file. Only the trailing ".<language>.srt" is swapped:
    # a plain str.replace would also rewrite the language code wherever else
    # it occurs in the path (e.g. "en" inside the title).
    suffix = '.' + primary_language + '.srt'
    merged_path = primary_path[:-len(suffix)] + '.merged.srt'
    merged_text = srt.compose(subtitles_merged)
    with open(merged_path, 'w') as merged_file:
        merged_file.write(merged_text)
def test_subs_missing_content_removed(content_subs, contentless_subs, contentless_text):
    """Subtitles whose content is ``contentless_text`` must be stripped.

    After sort_and_reindex only ``content_subs`` may remain, indexed
    consecutively from the default start index.
    """
    for sub in contentless_subs:
        sub.content = contentless_text

    mixed = contentless_subs + content_subs
    composed_subs = list(srt.sort_and_reindex(mixed, in_place=True))

    # Everything from contentless_subs should be gone, leaving content_subs
    subs_eq(composed_subs, content_subs, any_order=True)

    # Indexes run from the default start index, excluding contentless subs
    default_start_index = 1
    expected_indexes = list(
        range(default_start_index, default_start_index + len(composed_subs))
    )
    actual_indexes = [sub.index for sub in composed_subs]
    assert actual_indexes == expected_indexes
def main():
    """Interactively merge several .srt files into one.

    Prompts for the number of files, then for each file path, then for the
    output path. All subtitles are concatenated, sorted and reindexed with
    ``srt.sort_and_reindex`` and written out with ``srt.compose``.
    """
    n = int(input('Amount of .srt files: '))
    subtitlesmerged = []
    # range() so that exactly n files are requested; iterating the tuple
    # (1, n) would always ask for two files regardless of n.
    for i in range(1, n + 1):
        p = input('File no. %s: ' % i)
        with open(p, 'r') as file:
            data = file.read()
        # str.replace returns a new string; keep the result so the BOM is
        # actually removed before parsing.
        data = data.replace(u'\ufeff', '')
        subtitlesmerged.extend(srt.parse(data))
    subtitles = list(srt.sort_and_reindex(subtitlesmerged))
    subtitles_composed = srt.compose(subtitles)
    outputpath = input('Write merged .srt file to: ')
    with open(outputpath, 'w') as output:
        output.write(subtitles_composed)
def test_subs_missing_content_removed(content_subs, contentless_subs, contentless_text):
    """Contentless subtitles are removed and the rest reindexed from 1."""
    # Blank out the content of every sub that is supposed to be stripped
    for sub in contentless_subs:
        sub.content = contentless_text

    candidates = contentless_subs + content_subs
    composed_subs = list(srt.sort_and_reindex(candidates, in_place=True))

    # We should be left with the same subs as there are in content_subs, as
    # all contentless_subs should have been stripped.
    subs_eq(composed_subs, content_subs, any_order=True)

    # The remaining subs are numbered consecutively from the default start
    # index, with contentless subs not taking up any index
    default_start_index = 1
    stop_index = default_start_index + len(composed_subs)
    observed = [sub.index for sub in composed_subs]
    eq(observed, list(range(default_start_index, stop_index)))
def test_sort_and_reindex(input_subs, start_index):
    """sort_and_reindex must number from ``start_index`` and sort by start."""
    # Same end time on every sub so that start time is compared only.
    # NOTE(review): the value is tied to how start_timestamp_strategy builds
    # start times (defined outside this block) - confirm before changing it.
    pinned_end = timedelta(500001)
    for sub in input_subs:
        sub.end = pinned_end

    reindexed_subs = list(
        srt.sort_and_reindex(input_subs, start_index=start_index, in_place=True)
    )

    # Indexes should run consecutively from start_index
    expected_indexes = list(range(start_index, start_index + len(input_subs)))
    assert [sub.index for sub in reindexed_subs] == expected_indexes

    # Order should follow start time
    by_start = sorted(input_subs, key=lambda sub: sub.start)
    assert reindexed_subs == by_start
def test_sort_and_reindex(input_subs, start_index):
    """Reindexing starts at ``start_index`` and output is sorted by start."""
    # Pin all subs to the same end time so that start time is compared only
    common_end = timedelta(1)
    for sub in input_subs:
        sub.end = common_end

    sorted_in_place = srt.sort_and_reindex(
        input_subs, start_index=start_index, in_place=True
    )
    reindexed_subs = list(sorted_in_place)

    # The subtitles should be reindexed starting at start_index
    observed_indexes = [sub.index for sub in reindexed_subs]
    wanted_indexes = list(range(start_index, start_index + len(input_subs)))
    eq(observed_indexes, wanted_indexes)

    # The subtitles should be sorted by start time
    eq(reindexed_subs, sorted(input_subs, key=lambda sub: sub.start))
def test_sort_and_reindex(input_subs, start_index):
    """Verify both the numbering and the ordering done by sort_and_reindex."""
    for sub in input_subs:
        # Every sub gets the same end time so only start time decides order.
        # NOTE(review): timedelta(500001) is chosen with respect to
        # start_timestamp_strategy, which is not visible here - confirm.
        sub.end = timedelta(500001)

    result_iter = srt.sort_and_reindex(
        input_subs, start_index=start_index, in_place=True
    )
    reindexed_subs = list(result_iter)

    # Numbering: consecutive, beginning at start_index
    last_index_exclusive = start_index + len(input_subs)
    eq(
        [sub.index for sub in reindexed_subs],
        list(range(start_index, last_index_exclusive)),
    )

    # Ordering: ascending start time
    expected_sorting = sorted(input_subs, key=lambda sub: sub.start)
    eq(reindexed_subs, expected_sorting)
def test_sort_and_reindex(input_subs, start_index):
    """Output of sort_and_reindex is indexed from ``start_index``, by start."""
    one_day = timedelta(1)
    for sub in input_subs:
        # Identical end times, so that start time is compared only
        sub.end = one_day

    reindexed_subs = [
        sub
        for sub in srt.sort_and_reindex(
            input_subs, start_index=start_index, in_place=True
        )
    ]

    # The subtitles should be reindexed starting at start_index
    index_span = range(start_index, start_index + len(input_subs))
    eq([sub.index for sub in reindexed_subs], list(index_span))

    # The subtitles should be sorted by start time
    start_ordered = sorted(input_subs, key=lambda sub: sub.start)
    eq(reindexed_subs, start_ordered)
def preprocess_subs(contents):
    """Parse SRT text and return the subtitles as a sorted, reindexed list.

    ``contents`` is handed to ``srt.parse``; its output goes through
    ``srt.sort_and_reindex`` and is materialised into a list.
    """
    parsed = srt.parse(contents)
    ordered = srt.sort_and_reindex(parsed)
    return list(ordered)
# Translate each chunk in sub_n and collect the translated text.
sub_n_translated = []
translator = Translator()
for s in sub_n:
    s_translated = translator.translate(s, src=original_language, dest=language_to_translate)
    sub_n_translated.append(s_translated.text)
    # Progress marker, one per translated chunk
    print('.')

# Split the translated chunks back into individual lines.
sub_translated = []
for s in sub_n_translated:
    sub_translated.extend(s.split('\n'))

# Re-parse the original subtitles and swap in the translated lines by position.
# NOTE(review): this raises IndexError if the translation produced fewer lines
# than there are subtitles - confirm the chunking upstream guarantees a match.
sub = srt.parse(subtitle)
new_subititle = []
for i, s in enumerate(sub):
    s.content = sub_translated[i]
    new_subititle.append(s)
new_subititle = list(srt.sort_and_reindex(new_subititle))

# Context manager so the output file is closed even if composing/writing fails.
with open(srt_translated, 'w') as file:
    file.write(srt.compose(new_subititle))
print(srt_translated)
def transform(self, *args):
    """Apply ``self.function`` to ``self.srt_list`` and re-sort the result.

    The callable receives the current list followed by ``*args``. Its return
    value is stored back on the instance, then replaced by the list produced
    by ``srt.sort_and_reindex``.
    """
    apply = self.function
    self.srt_list = apply(self.srt_list, *args)
    reindexed = srt.sort_and_reindex(self.srt_list)
    self.srt_list = list(reindexed)
# Split the input into audio segments.
files = extractAudio(args.input, args.temp_dir, smoothing_window=args.silence_window, weight=args.silence_weight)

# Init the stt
stt = STTPipeline(args.model_dir)

# Start transcribing
print("Transcribing...")
subs = []
for w_file in tqdm(files):
    start, end, transcription = process_audio(w_file, stt)
    # Segments whose transcription is empty or whitespace produce no subtitle
    if not transcription.strip():
        continue
    subs.append(
        srt.Subtitle(0,
                     datetime.timedelta(seconds=float(start)),
                     datetime.timedelta(seconds=float(end)),
                     transcription))

# write output
print(f"Writing subtitle file to {args.output}...")
# sort_and_reindex is a generator: it does nothing until it is consumed, so
# the result is materialised here. It is done before opening the output file
# so a failure cannot leave a truncated file behind.
subs = list(srt.sort_and_reindex(subs, in_place=True))
with open(args.output, "w", encoding="utf-8") as f:
    f.write(srt.compose(subs))
print("Removing temporary files...")
shutil.rmtree(args.temp_dir)
def _apply_rule(content, rule):
    """Apply one cleaning rule to a subtitle's text.

    Returns the (possibly rewritten) text, or ``None`` when the rule deletes
    the subtitle. Unknown rule types/actions leave the text unchanged.
    """
    if rule["type"] == "regex":
        if rule["action"] == "replace":
            # flags must be passed by keyword: the 4th positional argument of
            # re.sub is ``count``, not ``flags``.
            return re.sub(
                rule["pattern"], rule["value"], content, flags=re.MULTILINE
            )
        if rule["action"] == "delete":
            if re.search(rule["pattern"], content, re.MULTILINE):
                return None
    elif rule["type"] == "string":
        if rule["action"] == "replace":
            return content.replace(rule["pattern"], rule["value"])
        if rule["action"] == "delete":
            # Delete when the pattern IS present, mirroring the regex rule
            if rule["pattern"] in content:
                return None
    return content


def parse_srt(settings, file, summary, dry_run, quiet, verbose):
    """Clean one subtitle file according to ``settings["rules"]``.

    Each subtitle is run through every rule (rules carrying an
    ``only_if_match`` glob apply only when ``file`` matches it). A rule can
    rewrite the text or delete the subtitle. Unless ``dry_run`` is set, the
    result is sorted, reindexed and written back to ``file`` when anything
    changed.

    Args:
        settings: Mapping with a ``"rules"`` list; each rule has ``name``,
            ``type`` (``"regex"``/``"string"``), ``action``
            (``"replace"``/``"delete"``), ``pattern`` and, for replace,
            ``value``.
        file: Path of the UTF-8 .srt file to read and overwrite.
        summary: Print a one-line summary at the end.
        dry_run: Report what would change without writing.
        quiet: Suppress some informational output.
        verbose: Print every changed or removed subtitle.

    Returns:
        ``False`` if the file could not be read or parsed, ``True`` otherwise.
    """
    if dry_run or verbose:
        print("Parsing '{0}'...".format(file))

    try:
        with open(file, "r", encoding="utf-8") as filehandler:
            raw_text = filehandler.read()
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: undecodable content
        # (UnicodeDecodeError) or an invalid path.
        print()
        print("Couldn't open file '{0}'".format(file))
        return False

    try:
        original_subtitles = list(srt.parse(raw_text))
    except Exception:
        # Kept broad on purpose so any parser failure is reported, but no
        # longer swallows KeyboardInterrupt/SystemExit like a bare except.
        print()
        print("Trouble parsing subtitles in '{0}'".format(file))
        return False

    new_subtitle_file = []
    removed_line_count = 0
    modified_line_count = 0

    for position, original in enumerate(original_subtitles):
        original_subtitle_text = original.content
        new_subtitle = srt.Subtitle(
            position,
            start=original.start,
            end=original.end,
            content=original.content,
            # Taken from this subtitle, not from a fixed index
            proprietary=original.proprietary,
        )

        # Names of the rules that changed or removed this subtitle
        line_history = []
        for rule in settings["rules"]:
            if "only_if_match" in rule:
                if not fnmatch.fnmatch(file, rule["only_if_match"]):
                    continue

            updated_text = _apply_rule(new_subtitle.content, rule)
            if updated_text is None:
                new_subtitle = None
                line_history.append(rule["name"])
                break
            if updated_text != new_subtitle.content:
                new_subtitle.content = updated_text
                line_history.append(rule["name"])

        if new_subtitle is not None:
            # A subtitle emptied by replacements is dropped from the output
            if new_subtitle.content != "":
                new_subtitle_file.append(new_subtitle)
            if new_subtitle.content != original_subtitle_text:
                modified_line_count += 1
                if verbose:
                    if not quiet:
                        print()
                    print(wrap_sub(original_subtitle_text, "-"))
                    print(wrap_sub(new_subtitle.content, "+"))
                    print("|By rule(s): {0}".format(
                        ", ".join(map(str, line_history))))
        else:
            removed_line_count += 1
            if verbose:
                if not quiet:
                    print()
                print(wrap_sub(original_subtitle_text, "-"))
                print("|By rule: {0}".format(line_history[-1]))

    if not dry_run:
        new_subtitle_file = list(srt.sort_and_reindex(new_subtitle_file))
        if (modified_line_count != 0 or removed_line_count != 0
                or new_subtitle_file != original_subtitles):
            print()
            if (modified_line_count == 0 and removed_line_count == 0
                    and not quiet):
                print(
                    "Only changes to sorting and indexing found; No changes to subtitles detected."
                )
            if not quiet or verbose:
                print("Saving subtitle file {0}...".format(file))
                print()
            with open(file, "w", encoding="utf-8") as filehandler:
                filehandler.write(srt.compose(new_subtitle_file))
        else:
            if not quiet or verbose:
                print("No changes to save")
                print()

    if summary or verbose:
        if dry_run:
            if verbose:
                print()
            print(
                "Summary: {0} Lines to be modified; {1} Lines to be removed; '{2}'"
                .format(modified_line_count, removed_line_count, file))
        else:
            print(
                "Summary: {0} Lines modified; {1} Lines removed; '{2}'".format(
                    modified_line_count, removed_line_count, file))
        print()

    return True
# merge_subs(all_file_list): merge the subtitles of several .srt files into "merge.srt".
#
# For each name in all_file_list whose extension is ".srt", the file is opened as
# UTF-8 and parsed with srt.parse(..., ignore_errors=False) into mergesub_list.
# The subtitles of the first .srt file are copied into mergesub. The nested helper
# langfixed(subtitle_1, subtitle_2) compares the start/end times of subtitle_2[i]
# with those of subtitle_1[j] and then either appends mergesub_list[i].content
# (after a newline) to the content of mergesub[j] or mergesub[j+1], or inserts
# mergesub_list[i] into mergesub at position j. Finally mergesub is passed through
# srt.sort_and_reindex and the composed text is written to "merge.srt" (UTF-8).
#
# NOTE(review): the original layout of this function was lost (it is collapsed
# onto two physical lines), so the exact nesting of the branches below cannot be
# confirmed from here; the code is therefore left untouched.
# NOTE(review): langfixed reads and mutates mergesub / mergesub_list from the
# enclosing scope rather than through its own parameters, yet it is called with
# the two lists in either order - verify that the i/j indexing is intended.
# NOTE(review): the `j == 0` branch indexes mergesub[j+1] and mergesub[j+2]
# without the `j+1 < len(mergesub)` / `j+2 < len(mergesub)` guards used in the
# other branch - this looks like it can raise IndexError on short lists; confirm.
# NOTE(review): total_file is opened and closed manually (no `with`), so it is
# not closed if srt.compose or writelines raises.
def merge_subs(all_file_list): mergesub = list() filecount = 0 for file_name in all_file_list: ftype = os.path.splitext(file_name)[1] if ftype == r".srt": filecount = filecount + 1 with open(file_name,"r",encoding="utf-8") as fileread: mergesub_list = list(srt.parse(fileread,ignore_errors=False)) if filecount == 1: for i in range (len(mergesub_list)): mergesub.append(mergesub_list[i]) def langfixed(subtitle_1,subtitle_2): for i in range (len(subtitle_2)): for j in range (len(subtitle_1)): startflag = subtitle_1[j].start endflag = subtitle_1[j].end start = subtitle_2[i].start end = subtitle_2[i].end if j == 0: if start <= startflag and startflag < end < endflag: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if start <= startflag and endflag <= end: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if startflag < start < endflag and endflag <= end < mergesub[j+1].start: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if startflag < start < endflag and endflag <= end: if end - mergesub[j+2].start > timedelta(microseconds=0): mergesub[j+1].content = mergesub[j+1].content + "\n" + mergesub_list[i].content break if endflag - start > end - mergesub[j+1].start: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break else: mergesub[j+1].content = mergesub[j+1].content + "\n" + mergesub_list[i].content break if endflag - start == end - mergesub[j+1].start: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if startflag <= start and end <= endflag: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if start < startflag and end < startflag: mergesub.insert(j,mergesub_list[i]) break else: if mergesub[j-1].end < start <= startflag and startflag < end < endflag: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if start <= startflag and endflag <= end: 
mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if startflag < start < endflag and endflag <= end < mergesub[j+1].start: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if startflag < start < endflag and endflag <= end: if j+2 < len(mergesub): if end - mergesub[j+2].start > timedelta(microseconds=0): mergesub[j+1].content = mergesub[j+1].content + "\n" + mergesub_list[i].content break if j+1 < len(mergesub): if endflag - start > end - mergesub[j+1].start: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break else: mergesub[j+1].content = mergesub[j+1].content + "\n" + mergesub_list[i].content break if endflag - start == end - mergesub[j+1].start: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if startflag <= start and end <= endflag: mergesub[j].content = mergesub[j].content + "\n" + mergesub_list[i].content break if start < startflag and end < startflag: mergesub.insert(j,mergesub_list[i]) break else: if len(mergesub_list) > len(mergesub): langfixed(mergesub,mergesub_list) else: langfixed(mergesub_list,mergesub) merge = list(srt.sort_and_reindex(mergesub))
 total_file = open("merge.srt","w",encoding="utf-8") total_file.writelines(srt.compose(merge)) total_file.close()