def test_invalid_file(self):
    """Check that unsupported subtitle types are rejected.

    Two cases are exercised: a path ending in ``.py`` with no explicit
    type, and a valid ``srt`` fixture parsed with ``subtitle_type="py"``.
    """
    # Case 1: no explicit subtitle type, path ends in ".py".
    self.assertRaises(InvalidSubtitleTypeError, parser.parse, "test.py")

    # Case 2: valid srt fixture, but the explicit type is "py".
    with self.assertRaises(InvalidSubtitleTypeError):
        valid_srt = PATH.format(test_type="valid", subtitle_type="srt")
        parser.parse(valid_srt, subtitle_type="py")
def subtitles_parser(subtitle_file, splits=1, set_directory="."):
    """Join all subtitle text of ``subtitle_file`` and hand it to ``generate_sets``.

    The file is parsed with the parser's default encoding first. If reading
    the entries raises ``UnicodeDecodeError``, the file is parsed again with
    ``encoding="ISO-8859-1"``.

    Args:
        subtitle_file: Path of the subtitle file to read.
        splits: Forwarded unchanged to ``generate_sets``.
        set_directory: Forwarded unchanged to ``generate_sets``.
    """
    parsed = psparser.parse(subtitle_file)
    try:
        pieces = [entry.text for entry in parsed]
    except UnicodeDecodeError:
        # Retry the whole file with the fallback encoding.
        fallback = psparser.parse(subtitle_file, encoding="ISO-8859-1")
        pieces = [entry.text for entry in fallback]

    generate_sets(" ".join(pieces), splits, set_directory)
def _get_raw_subtitles_for_video(video_path, mkv_streams):
    """Find the subtitles that belong to ``video_path``.

    Lookup order:

    1. Walk the video's directory for a file whose extension is listed in
       ``_valid_sub_exts`` and whose episode number (as reported by
       ``_get_episode_no``) equals the video's. The first match is parsed
       and returned.
    2. Otherwise, for ``.mkv`` videos, every subtitle stream listed in
       ``mkv_streams[1]`` is extracted with ffmpeg into a temporary file next
       to the video, parsed, and deleted again. The stream with the most
       entries wins.

    Args:
        video_path: Path of the video file.
        mkv_streams: Stream description; index 1 is iterated as
            ``(lang, ext)`` pairs, one per subtitle stream.

    Returns:
        The result of ``parser.parse`` for a matching external file, the
        list of parsed entries of the largest embedded stream, or ``None``
        when nothing was found.
    """
    dir_name = dirname(video_path)
    file_name = basename(video_path)
    video_ext = splitext(video_path)[1]
    target_episode_number = float(_get_episode_no(file_name))

    # First visit for this video: run the episode-number extraction over
    # every candidate subtitle file without comparing the result, so that all
    # "failed to extract the episode number" issues are dealt with upfront.
    if video_path not in _has_initialised_path:
        _has_initialised_path[video_path] = None
        for _, _, candidate_names in walk(dir_name):
            for candidate_name in candidate_names:
                if splitext(candidate_name)[1][1:] not in _valid_sub_exts:
                    continue
                _get_episode_no(candidate_name)

    # The loop variables are deliberately distinct from ``file_name`` so the
    # video's own base name is still intact for the mkv branch below.
    for root, _, candidate_names in walk(dir_name):
        for candidate_name in candidate_names:
            if splitext(candidate_name)[1][1:] not in _valid_sub_exts:
                continue
            episode_number = float(_get_episode_no(candidate_name))
            if episode_number == target_episode_number:
                return parser.parse(join(root, candidate_name))

    if video_ext == '.mkv':
        # (parsed entries, entry count) of the best stream seen so far.
        subtitles = (None, 0)
        for i, (lang, ext) in enumerate(mkv_streams[1]):
            if ext == 'subrip':
                ext = 'srt'
            subtitle_path = join(dir_name, f'({i}) {file_name}.{ext}')
            # NOTE(review): the command is passed to run() as a single string
            # without shell=True -- confirm this works on every target
            # platform and that paths can never contain a double quote.
            run(f'{_get_ffmpeg()} -i "{video_path}" -y -map 0:s:{i} "{subtitle_path}"', stdout=PIPE, stderr=PIPE)
            try:
                parsed_subtitles = list(parser.parse(subtitle_path))
            finally:
                # Always drop the temporary extraction, even if parsing fails.
                remove(subtitle_path)
            no_subtitles = len(parsed_subtitles)
            if no_subtitles > subtitles[1]:
                subtitles = (parsed_subtitles, no_subtitles)
        return subtitles[0]
def test_invalid_timestamps(self, subtitle_type):
    """Pulling the first entry of an ``invalid_timestamps`` fixture must fail.

    ``parser.parse`` is called outside the assertion, so the
    ``InvalidTimestampError`` is expected from ``next`` on its result.
    """
    fixture = PATH.format(test_type="invalid_timestamps", subtitle_type=subtitle_type)
    entries = parser.parse(fixture, subtitle_type=subtitle_type, fps=24)

    self.assertRaises(InvalidTimestampError, next, entries)
def match_file_with_lang(sub_files, lang_iso):
    """Pick the subtitle file from ``sub_files`` that matches ``lang_iso``.

    Matching happens in two passes:

    1. By file name: the lower-cased base name contains the lower-cased
       ``lang_iso.name``, or the base name without its last extension ends
       with ``lang_iso.alpha_3``.
    2. By content: a sample of subtitle texts from each file is fed to
       ``langdetect`` and the detected code is compared to
       ``lang_iso.alpha_2``.

    Args:
        sub_files: Iterable of subtitle file paths (iterated twice).
        lang_iso: Object exposing ``name``, ``alpha_2`` and ``alpha_3``.

    Returns:
        The first matching path, or ``None`` when no file matches.
    """
    from itertools import islice

    def _sublist_of_generator(gen, start, end):
        """Return items ``start`` .. ``end - 1`` of ``gen``; fewer if it ends or fails early."""
        items = []
        try:
            for item in islice(gen, start, end):
                items.append(item)
        except Exception:
            # Best effort: a file that cannot be read or parsed further just
            # contributes whatever was collected so far. Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit through.
            pass
        return items

    # Match based on the full language name & alpha_3
    for f in sub_files:
        sub_basename = os.path.basename(f).lower()
        if lang_iso.name.lower() in sub_basename:
            return f
        if sub_basename.rsplit('.', 1)[0].endswith(lang_iso.alpha_3):
            return f

    # Match based on the content of the text
    for f in sub_files:
        # Skip the first 10 entries, then sample up to 31 (indices 10..40).
        subtitles = [
            s.text.strip()
            for s in _sublist_of_generator(SubParser.parse(f), 10, 41)
        ]
        if not any(subtitles):
            # Nothing to detect from; langdetect raises on text without
            # usable features, so move on to the next file instead.
            continue
        detected_lang = langdetect.detect('. '.join(subtitles))
        if detected_lang == lang_iso.alpha_2:
            return f

    return None
def parse_subtitles(subtitle_file: str) -> List[Subtitle]:
    """
    Return a list of Subtitle objects derived from the given file.

    The file's bytes are passed to ``chardet.detect`` and the reported
    encoding is handed to the subtitle parser.
    """
    with open(subtitle_file, 'rb') as raw:
        detection = chardet.detect(raw.read())

    converted = []
    for entry in subparser.parse(subtitle_file, encoding=detection['encoding']):
        converted.append(
            Subtitle(entry.index, entry.text, _to_sec(entry.start), _to_sec(entry.end))
        )
    return converted
def test_valid_subtitles(self, subtype):
    """Parse the ``valid`` fixture and validate its six entries in order."""
    path = PATH.format(test_type='valid', subtype=subtype)
    subtitles = parse(path, subtype=subtype, fps=24)

    # One row per expected entry: (text, clean, start, end, duration).
    expected_rows = (
        ('Subtitle', 'subtitle', (0, 0, 1), (0, 0, 2), 1000),
        ('- Subtitle', 'subtitle', (0, 0, 3), (0, 0, 3), 500),
        ('[Sound effect] Subtitle', 'subtitle', (0, 1, 5), (0, 1, 5), 250),
        ('<format>Subtitle</format>', 'subtitle', (1, 30, 0), (1, 35, 0), 300000),
        ('Multi line Subtitle', 'multi line subtitle', (2, 0, 0), (2, 11, 11), 671000),
        ('Subtitle', 'subtitle', (2, 20, 0), (3, 0, 0), 2400000),
    )

    for position, (text, clean, start, end, duration) in enumerate(expected_rows):
        self.validate(
            sub=next(subtitles),
            index=position,
            text=text,
            clean=clean,
            start=start,
            end=end,
            duration=duration,
        )
def parse(filename: str) -> List[Subtitle]:
    """Parse ``filename`` into a list of ``Subtitle`` entities.

    The encoding comes from ``SubtitleParser._detect_encoding``; the parsed
    entries go through ``formatting.clean`` before being converted.
    """
    detected_encoding = SubtitleParser._detect_encoding(filename)
    cleaned_entries = formatting.clean(
        parser.parse(filename, encoding=detected_encoding)
    )
    return [
        Subtitle(quote=entry.text, start_time=entry.start, end_time=entry.end)
        for entry in cleaned_entries
    ]
def test_valid_subtitles(self, subtitle_type):
    """Parse the ``valid`` fixture and assert its six entries in order."""
    path = PATH.format(test_type="valid", subtitle_type=subtitle_type)
    subtitles = parser.parse(path, subtitle_type=subtitle_type, fps=24)

    # One row per expected entry: (text, start, end, duration).
    expected_rows = (
        ("Subtitle", (0, 0, 1), (0, 0, 2), 1000),
        ("- Subtitle", (0, 0, 3), (0, 0, 3), 500),
        ("[Sound effect] Subtitle", (0, 1, 5), (0, 1, 5), 250),
        ("<format>Subtitle</format>", (1, 30, 0), (1, 35, 0), 300000),
        ("Multi line Subtitle", (2, 0, 0), (2, 11, 11), 671000),
        ("Subtitle", (2, 20, 0), (3, 0, 0), 2400000),
    )

    for position, (text, start, end, duration) in enumerate(expected_rows):
        self._assert_subtitle(
            sub=next(subtitles),
            index=position,
            text=text,
            start=start,
            end=end,
            duration=duration,
        )
def cli(sub_file, output_manifest_file):
    """Write a JSON clip manifest for the subtitles in ``sub_file``.

    For every parsed subtitle one object is emitted with the start time
    (rounded down) and length (rounded up) in whole seconds, the subtitle
    text as ``title`` and a target file name built from the slugified text
    and the start second.

    Args:
        sub_file: Object whose ``name`` attribute is the subtitle file path.
        output_manifest_file: Writable file object receiving the JSON.
    """
    manifest = []
    for entry in parser.parse(sub_file.name):
        begin = get_microseconds(entry.start)
        finish = get_microseconds(entry.end)
        whole_seconds_long = math.ceil((finish - begin) / 1000_000)
        begins_at_second = math.floor(begin / 1000_000)
        target_name = f"{slugify(entry.text)}-{begins_at_second}.mp4"
        manifest.append(
            dict(
                start_time=begins_at_second,
                length=whole_seconds_long,
                rename_to=target_name,
                title=entry.text,
            )
        )

    json.dump(manifest, output_manifest_file, indent=4, sort_keys=True)
from pysubparser import parser
import freqlist
import MeCab

# Put the absolute path of the subtitle file here, with doubled backslashes, e.g.
# 'E:\PythonProjects\pythonProject1\Anime frec list\[Kamigami] Barakamon - 01 [1280×720 x264 AAC Sub(Chs,Jap)].ass'
# NOTE(review): ``sub_file`` is not used below; the path is hard-coded in the parse() call.
sub_file = 'put there path of the file'
subtitles = parser.parse('E:\\PythonProjects\\pythonProject1\\Anime frec list\\[Kamigami] Barakamon - 01 [1280×720 x264 AAC Sub(Chs,Jap)].ass')

# MeCab tagger created with the "-Owakati" option.
wakati = MeCab.Tagger("-Owakati")

for subtitle in subtitles:
    subline = subtitle
    #print(subtitle.text)
    # Run the subtitle text through the tagger and split the result on whitespace.
    line_splitted = (wakati.parse(subtitle.text).split())
    # NOTE(review): ``freqlist.`` looks like an unfinished statement that was
    # followed by a separate print() call -- confirm which freqlist function
    # was meant to be called here.
    freqlist. print(line_splitted)
def test_invalid_file(self):
    """Check that unsupported subtitle types are rejected.

    Two cases are exercised: a path ending in ``.py`` with no explicit
    type, and a valid ``srt`` fixture parsed with ``subtype='py'``.
    """
    # Case 1: no explicit subtype, path ends in ".py".
    self.assertRaises(InvalidSubtitleTypeError, parse, 'test.py')

    # Case 2: valid srt fixture, but the explicit subtype is "py".
    with self.assertRaises(InvalidSubtitleTypeError):
        valid_srt = PATH.format(test_type='valid', subtype='srt')
        parse(valid_srt, subtype='py')
def test_invalid_encoding(self):
    """Reading the ``invalid_encoding`` fixture as ASCII must raise ``UnicodeDecodeError``."""
    with self.assertRaises(UnicodeDecodeError):
        fixture = PATH.format(test_type='invalid_encoding', subtype='srt')
        # Consume every entry so the whole file is read.
        for _ in parse(fixture, encoding='ascii'):
            pass
from pysubparser import parser
import jieba
import sys
import csv

# Count how often each jieba-segmented word occurs in the subtitle files
# given on the command line, skipping ignored words.
word_count = {}

# Words that are never counted: the two bracket characters plus the second
# column of every row in Chinese.txt. A set keeps the per-word membership
# test below constant-time.
ignore_list = {'》', '《'}
with open("Chinese.txt") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    for row in read_tsv:
        ignore_list.add(row[1])

for filename in sys.argv[1:]:
    subtitles = parser.parse(filename)
    for subtitle in subtitles:
        seg_list = jieba.cut(subtitle.text, cut_all=False)
        for word in seg_list:
            if word not in ignore_list:
                word_count[word] = word_count.get(word, 0) + 1

# Ascending by count, so the most frequent words come last.
sorted_words = sorted(word_count.items(), key=lambda kv: kv[1])

# NOTE(review): this slice prints the 100 entries *before* the last one, so
# the single most frequent word is never shown -- confirm that is intended.
for w, count in sorted_words[-101:-1]:
    print(w + " -> " + str(count))
def setUp(self) -> None:
    """Parse the cleaners fixture and store the result on ``self.subtitles``."""
    fixture_path = "./tests/files/valid/cleaners.srt"
    self.subtitles = parse(fixture_path)
def getSeconds(timeObj):
    """Return the hour, minute and second fields of ``timeObj`` as float seconds.

    Only ``hour``, ``minute`` and ``second`` are read.
    """
    whole_minutes = timeObj.hour * 60 + timeObj.minute
    return float(whole_minutes * 60 + timeObj.second)


def callSystem(time, index):
    """Print the shell pipeline that would schedule ``post.py`` for ``index`` via ``at``.

    Args:
        time: Object with ``strftime``; formatted as ``%H:%M %Y-%m-%d``.
        index: Value passed to ``post.py`` on its command line.
    """
    echo_part = f'echo "python3 {filepath}/post.py {index}"'
    at_part = f" | at -M {time.strftime('%H:%M %Y-%m-%d')}"
    # The pipeline is only printed; running it is currently disabled.
    print(echo_part + at_part)
    #os.system(command + at)


config = configparser.ConfigParser()
config.read(f'{filepath}/config.ini')
filename = config['DEFAULT']['filename']

# Materialise the parser's generator so the last entry can be indexed.
subtitlesGen = parser.parse('./' + filename)
subtitles = list(subtitlesGen)

startTime = datetime.datetime.strptime('00:00:00.000000', "%H:%M:%S.%f").time()
endTime = subtitles[-1].end

# Ratio between one 365-day year and the end time of the last subtitle.
TOTAL_SECONDS_IN_ONE_YEAR = float(60 * 60 * 24 * 365)
secondsInMovie = getSeconds(endTime)
movieSecondMultiplier = TOTAL_SECONDS_IN_ONE_YEAR / secondsInMovie
#pseudoNow is sixty seconds in the future, to allow time for the script to run
import streamlit as st
import pandas as pd
import spacy

# Page chrome: sidebar title and a free-text input box.
st.sidebar.title('Views')
st.text_input(
    'Type a word',
    value='',
    max_chars=None,
    key=None,
    type='default',
)

from pysubparser import parser

# Write the text of every subtitle entry to the page, one call per entry.
subtitles = parser.parse('top gun-English.sub')
for line in subtitles:
    st.write(line.text)
def __init__(self, path_to_subtitle_file):
    """Parse ``path_to_subtitle_file`` and keep the result on ``self._subtitles``."""
    parsed = parser.parse(path_to_subtitle_file)
    self._subtitles = parsed
def test_srt_writer(self, subtitle_type):
    """Round-trip the ``valid`` fixture through the cleaners and the writer.

    The fixture is parsed, run through four cleaners, written to
    ``<fixture path>.srt``, parsed again as ``srt`` and its six entries are
    asserted in order.
    """
    path = PATH.format(test_type="valid", subtitle_type=subtitle_type)
    cleaned = parser.parse(path, subtitle_type=subtitle_type, fps=24)

    # Cleaners are applied in this order: ascii, lower_case, formatting, brackets.
    for cleaner in (ascii, lower_case, formatting, brackets):
        cleaned = cleaner.clean(cleaned)

    new_path = f"{path}.srt"
    writer.write(cleaned, new_path)

    reparsed = parser.parse(new_path, subtitle_type="srt")

    # One row per expected entry: (text, start, end, duration).
    expected_rows = (
        ("subtitle", (0, 0, 1), (0, 0, 2), 1000),
        ("- subtitle", (0, 0, 3), (0, 0, 3), 500),
        ("subtitle", (0, 1, 5), (0, 1, 5), 250),
        ("subtitle", (1, 30, 0), (1, 35, 0), 300000),
        ("multi line subtitle", (2, 0, 0), (2, 11, 11), 671000),
        ("subtitle", (2, 20, 0), (3, 0, 0), 2400000),
    )

    for position, (text, start, end, duration) in enumerate(expected_rows):
        self._assert_subtitle(
            sub=next(reparsed),
            index=position,
            text=text,
            start=start,
            end=end,
            duration=duration,
        )
from pysubparser import parser

subtitles = parser.parse('Anime frec list//[Kamigami] Barakamon - 01 [1280×720 x264 AAC Sub(Chs,Jap)].ass')

# Print the text of every subtitle entry, one per line.
for entry in subtitles:
    print(entry.text)
import configparser
#from os import path
import sys
from pysubparser import parser
import pathlib
import os

# Post the text of one subtitle entry, selected by index on the command
# line, using the credentials and subtitle file named in config.ini.
#
# NOTE(review): ``twitter`` is used below but no import of it is visible in
# this file -- confirm it is brought into scope somewhere.

filepath = pathlib.Path(__file__).parent.absolute()

config = configparser.ConfigParser()
config.read(str(filepath) + '/config.ini')
filename = config['DEFAULT']['filename']

# Without an index argument there is nothing to send. sys.exit() is used
# because quit() is a convenience of the ``site`` module for interactive use.
if len(sys.argv) < 2:
    sys.exit()

indexToSend = int(sys.argv[1])

subtitles = parser.parse(str(filepath) + '/' + filename, 'srt')

api = twitter.Api(consumer_key=config['DEFAULT']['consumer_key'],
                  consumer_secret=config['DEFAULT']['consumer_secret'],
                  access_token_key=config['DEFAULT']['access_token_key'],
                  access_token_secret=config['DEFAULT']['access_token_secret'])

# Text of the first entry whose index matches; an empty string when no
# entry matches.
subtitleToSend = next(
    (subtitle.text for subtitle in subtitles if subtitle.index == indexToSend),
    '',
)

api.PostUpdates(status=subtitleToSend)
def test_invalid_encoding(self):
    """Reading the ``invalid_encoding`` fixture as ASCII must raise ``UnicodeDecodeError``."""
    with self.assertRaises(UnicodeDecodeError):
        fixture = PATH.format(test_type="invalid_encoding", subtitle_type="srt")
        entries = parser.parse(fixture, encoding="ascii")
        # Consume every entry so the whole file is read.
        for _ in entries:
            pass
for sub in seg: print(sub.text) print(seg[-1].end) print("------------------------------------") print("Segment duration: " + str((time_to_millis(seg[-1].end) - time_to_millis(seg[0].start)) / 1000)) print("====================================") audio_filename = sys.argv[1] subtitle_filename = sys.argv[2] subtitles = parser.parse(subtitle_filename) segments = get_segments(subtitles) song = AudioSegment.from_mp3(audio_filename) folder = "out/" episode = "e01" n = 1 for seg in segments: start = time_to_millis(seg[0].start) - 1000 end = time_to_millis(seg[-1].end) + 1500 cut = song[start:end] cut.export(folder + episode + "_seg" + str(n) + ".mp3", format="mp3") print("===== Segment " + str(n) + " ========")