def main(argv):
    """Entry point: burn a subtitle file into a video.

    Expects exactly three positional arguments after the script name:
    input video, input .srt subtitle, output video. On wrong arity it
    prints usage and returns instead of raising.
    """
    # FIX: the original used Python-2 print statements, which are a
    # SyntaxError under Python 3; converted to the print() function.
    if len(argv) != 4:
        print('usage:')
        print(' python subtitle.py input-video.mp4 input-subtitle.srt output-video.avi')
        return
    input_video_fname = argv[1]
    input_subt_fname = argv[2]
    output_video_fname = argv[3]
    subt = Subtitle()
    subt.render(input_video_fname, input_subt_fname, output_video_fname)
def get_subtitles(file_rows: list) -> list:
    """Pair each timeline row with its subtitle-text row.

    Walks the raw file rows, remembering the most recent timeline row and
    subtitle row; as soon as both are present a Subtitle is emitted and the
    buffers reset.

    Args:
        file_rows: raw lines of a subtitle file.

    Returns:
        list of Subtitle objects, one per (timeline, text) pair.
    """
    # FIX: dropped the unused enumerate() index.
    result = []
    timeline_raw = ''
    subtitle_raw = ''
    for file_row in file_rows:
        if Subtitle.is_timeline(file_row):
            timeline_raw = file_row
        if Subtitle.is_subtitle_string(file_row):
            subtitle_raw = file_row
        if timeline_raw != '' and subtitle_raw != '':
            result.append(Subtitle(SubtitleString(subtitle_raw),
                                   TimeLine(timeline_raw)))
            timeline_raw = ''
            subtitle_raw = ''
    return result
def search_movie_subtitle(movie, language):
    """find subtitle for a movie base on assigned language

    if a language has not be assigned 'english' will be supplied by
    the Movie class

    Args:
        movie (Movie): The movie whose subtitle is to be searched.
        language (str): The subtitle language.
    """
    page_url = 'https://subscene.com/subtitles/' + movie.slug() + '/' + language
    resp = requests.get(page_url)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, "html.parser")
    # Response body is no longer needed once parsed.
    del resp

    results = []
    for cell in soup.findAll('td', attrs={'class': 'a1'}):
        spans = cell.findAll('span', attrs={'class': None})
        title = str(spans[0].contents[0]).encode('utf-8').strip()
        link = str(cell.find('a')['href']).strip()
        lang = str(cell.find('span').contents[0]).encode('utf-8').strip()
        results.append(Subtitle(title, link, lang))
    return results
def test_html_scan(self):
    """Scan a remote HTML page against the lexicon built by test_dumpData.

    NOTE(review): depends on test_dumpData() having populated self.pkl,
    and on live network access; the expected counts are fixture-specific
    magic numbers — confirm before changing.
    """
    self.test_dumpData()
    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    #sub_scan.addFile('https://www.coursera.org/')
    sub_scan.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
    sub_scan.parse()
    #sub_scan.words_show(50)
    #sub_scan.show()
    # Counts below reflect the snapshot of the page + lexicon fixtures.
    self.sub_assert(sub_scan, lex=4156, stem_lex=3104,
                    words=127, stem_words=71, new_words=32)
    pass
def get_timelines(input_file: str) -> list:
    """Return every timeline row found in the given subtitle file.

    Args:
        input_file: path of the subtitle file to read.

    Returns:
        list of the rows Subtitle recognises as timelines, in file order.
    """
    # FIX: the enumerate() index was unused; a filtering comprehension
    # says the same thing directly.
    file_rows = FileManager.get_file_rows(input_file)
    return [row for row in file_rows if Subtitle.is_timeline(row)]
def move_subtitles(subtitles, seconds, filename):
    """Shift every subtitle by `seconds` and write the result to `filename`.

    Mutates the Subtitle objects in place. If any shifted timestamp would
    go negative, prints an error and aborts without writing the file.
    """
    for entry in subtitles:
        assert isinstance(entry, Subtitle)
        shifted_start = entry.get_start_second() + seconds
        shifted_end = entry.get_end_second() + seconds
        if shifted_start < 0 or shifted_end < 0:
            print("TIMESTAMP OUT OF RANGE!!!")
            return
        entry.start_time = Subtitle.to_srt_timestamp(shifted_start)
        entry.end_time = Subtitle.to_srt_timestamp(shifted_end)
    Subtitle.to_srt_file(subtitles, filename)
    print('saved to', filename)
def setupMediaExtractor(self, i):
    """Attach video/audio extractors (and an optional Subtitle) to client i.

    Reads the client's movie name, builds a VideoCapturer and a matching
    AudioCapturer, and — when a subtitle file is configured — a Subtitle
    synchronised to the video's fps/frame count.
    """
    client = self.clients[i]
    movie_name = client['movie_name']
    capturer = VideoCapturer(movie_name)
    client['video_extractor'] = capturer
    client['audio_extractor'] = AudioCapturer(
        movie_name, capturer.fps, capturer.frame_count)
    if client['subtitle_file'] is not None:
        client['subtitle'] = Subtitle(
            capturer.frame_count, capturer.fps, client['subtitle_file'])
def load_subtitle(path, aqt_file):
    """Load an .aqt subtitle file into a list of Subtitle objects.

    Reads the file four lines at a time until read_next_lines() returns a
    falsy value. `path` is concatenated directly with `aqt_file`, so it is
    expected to end with a path separator.
    """
    # FIX: the original used Python-2 print statements (SyntaxError in
    # Python 3); also flattened the while/if/else into a guard + break.
    print("Loading %s%s" % (path, aqt_file))
    subtitles = []
    with open('%s%s' % (path, aqt_file), 'r') as f:
        while True:
            lines = read_next_lines(f, 4)
            if not lines:
                break
            subtitles.append(Subtitle(lines[1], lines[0], lines[2]))
    return subtitles
def scan_subtitle(path):
    """Scan a single subtitle file and build a Subtitle from its guessed name.

    Args:
        path: full path of the subtitle file.

    Raises:
        ValueError: if `path` does not exist.

    Returns:
        Subtitle built via Subtitle.fromguess from guessit's analysis.
    """
    if not os.path.exists(path):
        raise ValueError('Path does not exist')
    dirpath, filename = os.path.split(path)
    logger.info('Scanning subtitle %r in %r', filename, dirpath)
    # guess
    # BUG FIX: `path.strip(filename)` treats `filename` as a *set of
    # characters* to strip from both ends, which can eat arbitrary leading
    # characters of the path. Slice the filename off the end instead,
    # keeping the trailing separator.
    parent_path = path[:len(path) - len(filename)]
    subtitle = Subtitle.fromguess(parent_path, guessit(path))
    return subtitle
def main(in_subt, out_subt):
    """Parse, normalize and filter a subtitle, then write the processed copy.

    Args:
        in_subt: path of the input subtitle (UTF-8).
        out_subt: path for the generated subtitle.

    Exits the process with a message if the input file cannot be read.
    """
    assert in_subt != ""
    assert out_subt != ""
    parser = Parser()
    normalizer = Normalizer()
    lemma_filter = Filter()
    try:
        # FIX: use a context manager so the handle is closed even if
        # read() raises (the original leaked it on a mid-read IOError).
        with codecs.open(in_subt, 'r', encoding='utf8') as f:
            text = f.read()
    except IOError:
        sys.exit("The subtitle could not be found in the path you provided.")
    parser.parse(text)
    normalizer.normalize(parser.get_text())
    lemma_filter.clean_lemmas(normalizer.get_lemmas())
    new_sub = Subtitle(parser.get_indexes(), parser.get_times(),
                       parser.get_text(), lemma_filter.get_final_lemmas(),
                       lemma_filter.get_dict(), out_subt)
    new_sub.create_subtitle()
def test_txt_scan(self):
    """Scan a movie .srt against the lexicon built by test_dumpData.

    NOTE(review): depends on test_dumpData() having populated self.pkl and
    on the fixture file being present; all counts are fixture-specific
    magic numbers.
    """
    self.test_dumpData()
    sub_scan = Subtitle(loglevel=logging.DEBUG)
    sub_scan.setLexiconFile(self.pkl)
    sub_scan.loadOldData()
    sub_scan.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
    sub_scan.parse()
    #sub_scan.words_show(50)
    #sub_scan.show()
    self.sub_assert(sub_scan, lex=4156, stem_lex=3104,
                    words=1936, stem_words=943, new_words=518)
    '''
    assert len(sub_scan.lexicon) == 3929
    assert len(sub_scan.stem_lexicon) == 2968
    assert len(sub_scan.wordSet) <= 1807
    assert len(sub_scan.stem_newWords) <= 922
    assert len(sub_scan.newWords) <= 531
    '''
    # Character names from the movie must be detected as names…
    names_mv = ['Yuri', 'Simeon']
    for n in names_mv:
        assert n in sub_scan.nameSet
        '''
        if n in sub_scan.nameSet:
            print n;
            pass
        '''
        pass
    #print sub_scan.nameSet
    # …and therefore must not be reported as unknown new words.
    for n in names_mv:
        '''
        if n.lower() in sub_scan.newWords:
            print n;
            pass
        '''
        assert n.lower() not in sub_scan.newWords
        pass
    pass
def fix(self):
    """Rewrite the subtitle file, replacing each timeline's end time with
    the start time of the following timeline.

    Assumes a fixed block layout where the next timeline row sits exactly
    3 rows below the current one (timeline / text / blank / timeline) —
    TODO confirm against the input format.
    """
    caption_rows = FileManager.get_file_rows(self.input_filename)
    for i, caption_row in enumerate(caption_rows):
        if Subtitle.is_timeline(caption_row):
            if i + 3 < len(caption_rows):
                # Keep our start (before the ','), take the next
                # timeline's start as the new end (after the ',').
                timeline_splitted = caption_row.split(',')
                timeline_splitted_next = caption_rows[i + 3].split(',')
                HandleFunctions.save_str_to_file(
                    self.output_filename,
                    timeline_splitted[0] + ',' + timeline_splitted_next[0] + '\n')
            else:
                # Last timeline: no successor, copy it unchanged.
                HandleFunctions.save_str_to_file(self.output_filename, caption_row)
        else:
            # Non-timeline rows pass through untouched.
            HandleFunctions.save_str_to_file(self.output_filename, caption_row)
def __init__(self, logger=None, log_level=logging.INFO): classname = type(self).__name__ # print("classname: ", classname) if logger is None: self.logger = create_log(log_name=classname, level=log_level) else: self.logger = logger self.logger.info("\n-----------------") self.logger.info("Begin to init") self.logger.info("\n-----------------") self.sub = Subtitle(self.logger) self.sub.set_parse(True) self.files = dict() self.filetypes = ['srt', 'bak', 'm3u', 'txt'] self.vediotypes = ['mkv', 'mp4', 'avi'] self.rmsrt = False if self.sub.lexicon_path is None: self.sub.set_lexicon_file("lexicon/lexicon.xlsx") pass
def setUp(self):
    # Fresh Subtitle under test for every case; DEBUG for verbose logs.
    self.sub = Subtitle(loglevel=logging.DEBUG)
    pass
def read_next_subtitle(self):
    """Read one subtitle block from the underlying stream.

    The three reads must stay in this order: each consumes the next
    section of the block (identifier line, timestamp line, text lines).

    Returns:
        Subtitle with identifier, timestamp_begin/timestamp_end and text set.
    """
    subtitle = Subtitle()
    subtitle.identifier = self._read_identifier()
    subtitle.timestamp_begin, subtitle.timestamp_end = self._read_timestamps()
    subtitle.text = self._read_text()
    return subtitle
def import_subtitle(self, file: Path) -> Subtitle:
    """Wrap `file` as a Subtitle bound to this object's IMDb id."""
    return Subtitle.new(file, self.imdbid)
def remove_timelines(input_file: str, output_file: str):
    """Copy rows from `input_file` to `output_file`, dropping timeline rows.

    Args:
        input_file: path of the subtitle file to read.
        output_file: path the surviving rows are appended to.
    """
    # FIX: dropped the unused enumerate() index.
    file_rows = FileManager.get_file_rows(input_file)
    for file_row in file_rows:
        if not Subtitle.is_timeline(file_row):
            HandleFunctions.save_str_to_file(output_file, file_row)
def main(argv=None, log_ger=None):
    """Command-line entry point for the subtitle word-scanning tool.

    Parses getopt-style options, configures the Subtitle engine, scans the
    given files and optionally saves/plots the results.

    Args:
        argv: argument list (without the program name).
        log_ger: logger to use; created at INFO level when None.
    """
    if log_ger is None:
        log_ger = create_log(log_name="subtitle", level=logging.INFO)
    fname = None
    start_dtime = datetime.now()
    # print("Start time: "+str(start_dtime))#.strftime("%Y-%m-%d %H:%M:%S"))
    print()
    # sub=Subtitle(logging.getLogger())
    sub = Subtitle(log_ger)
    try:
        # BUG FIX: the long-option list was missing a comma after
        # "checkup", so Python concatenated it with "file=" into the bogus
        # option "checkupfile=" — `--checkup` and `--file` never worked.
        # Also added "excel=", which the -e/--excel branch below handles
        # but which was absent from the list.
        opts, args = getopt.getopt(
            argv, "hvf:w:t:d:e:p:s:b:?lm:WDc",
            ["help", "version", "parse", "checkup", "file=", "word=",
             "type=", "dir=", "excel=", "pickle=", "limit=", "section=",
             "bigger="])
        # print opts, args
        log_ger.info("opts:{0};args:{1}".format(opts, args))
    except getopt.GetoptError as msg:
        print("error happened when get options!!! error:{0}".format(msg))
        usage()
        log_ger.error("getopt.GetoptError:{0}, exit!".format(msg))
        sys.exit(2)
    except Exception as msg:
        log_ger.error("error:{0}, exit!".format(msg))
        sys.exit(2)
    _is_lines_show = False
    _is_words_show = False
    sub_type = ""
    words_limit = None
    for opt, arg in opts:
        if opt in ("-?", "-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-v", "--version"):
            version()
            sys.exit()
        elif opt in ("-b", "--bigger"):
            sub.set_times_bigger(int(arg))
        elif opt in ("-c", "--checkup"):
            sub.checkup = True
        elif opt in ("-d", "--dir"):
            print("Sorry, -d --dir option still not offer")
            sys.exit()
        elif opt in ("-e", "--excel"):
            pkl = arg
            sub.set_lexicon_file(pkl)
        elif opt in ("-s", "--section"):
            # Expect "start,end"; either side may be empty.
            if ',' in arg:
                section = arg.split(',')
                if len(section) == 2:
                    # print(section)
                    start, end = section
                    if len(start) != 0:
                        sub.set_start(int(start))
                    if len(end) != 0:
                        sub.set_end(int(end))
                    # print(start, end)
                else:
                    print("something wrong, with option -s --section:", arg)
                    sys.exit()
            else:
                print("something wrong, with option -s --section:", arg)
                sys.exit()
        elif opt in ('-f', "--file"):
            fname = arg
            sub.add_file(fname)
        elif opt in ('-p', "--parse"):
            sub.set_parse(True)
        elif opt == '-D':
            log_ger.setLevel(logging.DEBUG)
            sub.set_logger(log_ger)
            sub.set_debug(True)
        elif opt in ("-w", "--word"):
            word = arg
            sub.add_word(word)
            # Mostly used for testing; skip writing output.
            sub.set_output(False)
        elif opt in ("-t", "--type"):
            sub_type = arg
            if sub_type not in ('save', 'scan', 'cloud'):
                usage()
                sys.exit()
        elif opt in ("-m", "--limit"):
            words_limit = int(arg)
            # print words_limit
            _is_words_show = True
        elif opt == '-l':
            # show lines
            _is_lines_show = True
        elif opt == '-W':
            # show words
            _is_words_show = True
    # print sys.argv
    # sub.add_punctuation([',','!',';','.',':','>','<'])
    # sub.addLexicon(["hello", "world"])
    if sub.lexicon_path is None:
        sub.set_lexicon_file("lexicon/lexicon.xlsx")
    sub.load_old_data()
    sub.add_files(args)
    # sub.add_strings("hello world, I'm wang. Please call me wang.")
    sub.check_all(encode='utf-8')
    if _is_lines_show:
        sub.lines_show(words_limit)
    if _is_words_show:
        # print words_limit
        sub.words_show(words_limit)
    sub.show()
    if sub_type == 'save':
        sub.dump_data()
    elif sub_type == 'cloud':
        sub.cloud()
    print()
    end_dtime = datetime.now()
    # print("End time: "+str(end_dtime))
    timedelta = end_dtime - start_dtime
    print("Cost time: " + str(timedelta))
    # getChecksum(sys.argv[1])
    pass
class TransferBak(object):
    """
    # save srt to srt.bak
    # transfer all srt.bak files
    # mv wordch.srt to srt
    # create m3u from sub srt
    # create and cat word.txt
    """

    def __init__(self, logger=None, log_level=logging.INFO):
        # Set up logging, the Subtitle engine, and file-type tables.
        classname = type(self).__name__
        # print("classname: ", classname)
        if logger is None:
            self.logger = create_log(log_name=classname, level=log_level)
        else:
            self.logger = logger
        self.logger.info("\n-----------------")
        self.logger.info("Begin to init")
        self.logger.info("\n-----------------")
        self.sub = Subtitle(self.logger)
        self.sub.set_parse(True)
        self.files = dict()                      # extension -> set of (dir, name)
        self.filetypes = ['srt', 'bak', 'm3u']   # subtitle-side extensions
        self.vediotypes = ['mkv', 'mp4', 'avi']  # video extensions (sic "vedio")
        self.rmsrt = False                       # delete srt/m3u before transfer?
        self.is_m3u_only = False                 # only (re)build m3u playlists?
        if self.sub.lexicon_path is None:
            self.sub.set_lexicon_file("lexicon/lexicon.xlsx")
        pass

    def setrm(self):
        # Enable deleting existing srt/m3u files before transferring.
        self.rmsrt = True

    def set_debug(self):
        # Switch both this object and the Subtitle engine to DEBUG logging.
        self.logger.setLevel(logging.DEBUG)
        self.sub.set_logger(self.logger)
        self.sub.set_debug(True)

    def set_m3u_only(self):
        self.is_m3u_only = True

    def set_logger(self, logger):
        self.logger = logger
        pass

    # save srt to srt.bak
    def srt_to_baks(self):
        """Back up .srt files to .srt.bak (next to their matching video)."""
        for i in self.files["srt"]:
            # mv srt to srt.bak
            srt = os.path.join(i[0], i[1])
            bak = srt + ".bak"
            base = os.path.splitext(srt)[0]
            if self.get_vedio_num() <= 0:
                # No videos at all: back up unconditionally.
                print("move " + srt + " to " + bak)
                shutil.move(srt, bak)
                pass
            else:
                # Only back up srt files whose companion video exists.
                for postfix in self.vediotypes:
                    vedio = ''
                    if base.endswith(".en"):
                        # "xxx.en.srt" pairs with "xxx.<ext>"; bak drops ".en".
                        vedio = base[0:-3] + "." + postfix
                        bak = base[0:-3] + ".srt.bak"
                    else:
                        vedio = base + "." + postfix
                    if os.path.exists(vedio) and not os.path.exists(bak):
                        print("move " + srt + " to " + bak)
                        shutil.move(srt, bak)
        pass

    # get srt.bak file from mkv!
    def baks_mkv(self):
        """Extract embedded subtitle streams from videos into .srt.bak files.

        Uses ffmpeg to list streams, then either converts an embedded .ass
        track (via mkvextract + asstosrt) or extracts a subrip track
        directly with mkvextract.
        """
        failedfiles = []
        print("------------")
        print('get srt.bak file from vedio!')
        for postfix in self.vediotypes:
            if postfix in self.files:
                for fi, f in enumerate(self.files[postfix]):
                    mkvfile = os.path.join(f[0], f[1])
                    print("------------")
                    print(fi, mkvfile)
                    srtbak_path = os.path.splitext(mkvfile)[0] + ".srt.bak"
                    if os.path.exists(srtbak_path):
                        # Already extracted earlier; skip.
                        print(srtbak_path, "exists")
                        print("------------")
                        continue
                    print("1. Check streams in mkv!")
                    # ffmpeg prints the stream listing on stderr.
                    res = subprocess.run(["ffmpeg", "-i", mkvfile],
                                         stderr=subprocess.PIPE)
                    """
                    print(res)
                    print("------------")
                    print(res.stderr)
                    print(res.stdout)
                    print(res.args)
                    print("------------over")
                    """
                    # Parse "Stream #x:y(lang): Type: codec" lines plus the
                    # following "title :" metadata lines into dicts.
                    beg = False
                    streams = []
                    stream = None
                    for i, v in enumerate(res.stderr.decode("utf-8").split("\n")):
                        if "Stream" in v:
                            beg = True
                            m = re.match(r" *Stream #(\d+):(\d+)(\(\w+\))?: (\w+): (\w+)", v)
                            if m:
                                # print(m.groups()[1:])  # c
                                if stream is not None:
                                    streams.append(stream)
                                stream = {"id": m.group(2),
                                          "type": m.group(4),
                                          "filetype": m.group(5)}
                                if m.group(3) is not None:
                                    # Optional "(lang)" tag → subhead.
                                    stream['subhead'] = m.group(3)
                                continue
                            else:
                                print("err: reg not work on this sentence")
                                print(v)
                                sys.exit(2)
                        elif not beg:
                            continue
                        else:
                            m = re.match(r" *title *: (.*)", v)
                            if m:
                                stream['title'] = m.group(1)
                                pass
                        pass
                    streams_df = pd.DataFrame(streams)
                    print("streams_df:\n", streams_df)  # c
                    # print(streams_df.dtypes)  # c
                    if len(streams_df) < 2 or \
                            len(streams_df[streams_df["type"] == "Subtitle"]) <= 0:
                        print("not have enough stream")
                        failedfiles.append(mkvfile)
                        continue
                    print("2. get srt.bak file from mkv!")
                    # Prefer tracks whose title marks them as English+Chinese.
                    engchs = []
                    if 'title' in streams_df:
                        criterion = streams_df['title']. \
                            map(lambda t: re.match(r"(英.中)|(英中)|(中英字幕)", str(t)) is not None)
                        engchs = streams_df[criterion]
                    if len(engchs) > 0:
                        # todo: deal engchs
                        print("engchs:\n", engchs)
                        # print( os.path.splitext(mkvfile)[0] + ".srt.bak")  # c
                        base = os.path.splitext(mkvfile)[0]
                        srt = base + ".srt"
                        if 'ass' == str(engchs.iloc[0].filetype):
                            # Convert the .ass track to .srt via asstosrt.
                            # NOTE(review): assumes a matching .ass file
                            # already exists next to the video — confirm.
                            ass = base + ".ass"
                            ass_file = open(ass)
                            srt_str = asstosrt.convert(ass_file)
                            f = open(srt, "w")
                            f.write(srt_str)
                            f.close()
                            """
                            subprocess.run(
                                ["mkvextract", "tracks", mkvfile,
                                 "{0}:{1}".format(str(engchs.iloc[0].id), ass)])
                            time.sleep(2)
                            res = subprocess.run(["ffmpeg", "-i", ass, srt])
                            if os.path.getsize(srt) <= 10:
                                print(res)
                                print("------------")
                                print(res.stderr)
                                print(res.stdout)
                                print(res.args)
                                print("------------over")
                                sys.exit(8)
                            """
                            if os.path.exists(srt):
                                shutil.move(srt, srtbak_path)
                            else:
                                # todo: log err
                                print("error: {} failed to create!".format(srt))
                                failedfiles.append(mkvfile)
                            pass
                        elif 'subrip' == str(engchs.iloc[0].filetype):
                            # Subrip tracks can be extracted directly.
                            print("filetype:", str(engchs.iloc[0].filetype))
                            subprocess.run(
                                ["mkvextract", "tracks", mkvfile,
                                 "{0}:{1}".format(str(engchs.iloc[0].id),
                                                  srtbak_path)])
                            pass
                    else:
                        # Fall back to tracks tagged with an "eng" language.
                        criterion = streams_df['subhead']. \
                            map(lambda t: re.match(r"eng", str(t)) is not None)
                        eng = streams_df[criterion & (streams_df.type == "Subtitle")]
                        if len(eng) > 0:
                            # todo: deal eng
                            print("eng:\n", eng)
                            pass
                        else:
                            print('-------------')
                            print(f[1], "has no suitable subtitle stream!")
                            print(streams_df[streams_df.type == "Subtitle"])
                            failedfiles.append(mkvfile)
                            print('-------------')
                        pass
        print("files failed to create .srt.bak:\n")
        for f in failedfiles:
            print(f)
        print('-------------')
        pass

    # create and cat word.txt
    def words(self):
        pass

    def show_files(self):
        # Print a count of collected files per extension.
        for k, v in self.files.items():
            print(k + ": {}".format(len(v)))
        pass

    def check_types(self, path):
        """Walk `path` and rebuild self.files, bucketing files by extension."""
        self.files = dict()
        for root, dirs, files in os.walk(path):
            for file in files:
                for postfix in (self.filetypes + self.vediotypes):
                    if file.endswith("." + postfix):
                        if postfix not in self.files:
                            self.files[postfix] = set()
                        self.files[postfix].add((root, file))
                        break
        self.show_files()
        pass

    def rm_m3u(self, path):
        # Delete every collected .m3u playlist.
        if 'm3u' in self.files:
            for f in self.files['m3u']:
                p = os.path.join(f[0], f[1])
                os.remove(p)
                print(p + " was removed!")

    def rm_srt(self, path):
        """Delete srt files that have a .bak, plus m3u files and output.xlsx."""
        if 'srt' in self.files:
            for f in self.files['srt']:
                p = os.path.join(f[0], f[1])
                # Never delete the original srt when no .bak exists —
                # that mistake lost data before.
                if os.path.exists(p + ".bak"):
                    os.remove(p)
                    print(p + " was removed!")
        self.rm_m3u(path)
        out = path + "/output.xlsx"
        if os.path.exists(out):
            os.remove(out)
            print(out + " was removed!")

    def get_vedio_num(self):
        # Total number of collected video files across all video extensions.
        num = 0
        for postfix in self.vediotypes:
            if postfix in self.files:
                num += len(self.files[postfix])
        return num

    def transfer_dir(self, path):
        """Run the full pipeline on one directory.

        Backs up srt files, extracts missing .srt.bak from videos, feeds
        the baks through the Subtitle engine, promotes *_word.srt results
        back to .srt and finally builds m3u playlists.
        """
        self.logger.info("path: {}".format(path))
        self.check_types(path)
        if self.rmsrt:
            self.rm_srt(path)
            print("\nafter remove str and m3u files:")
            self.check_types(path)
        old_bak = set()
        if 'bak' in self.files:
            old_bak = copy.deepcopy(self.files["bak"])
        if self.get_vedio_num() > 0 and self.get_vedio_num() != len(old_bak):
            # Videos present but baks missing/incomplete: create them.
            if "srt" in self.files and \
                    len(self.files["srt"]) > 0:
                self.srt_to_baks()
                pass
            print("------------")
            print("after move srt to bak")
            self.check_types(path)
            if "bak" in self.files:
                if self.get_vedio_num() > len(self.files["bak"]):
                    print("There is more vedio than bak!")
                    print("There is no enough .srt.bak!")
                    self.baks_mkv()
            elif self.get_vedio_num() > 0:
                print("There is more vedio than bak(0)!")
                print("There is no enough .srt.bak!")
                self.baks_mkv()
            self.check_types(path)
            if "bak" in self.files and len(self.files["bak"]) == len(old_bak):
                print("baks num get no increase!")
            if 'bak' in self.files:
                old_bak = copy.deepcopy(self.files["bak"])
        elif len(old_bak) > 0:
            print(".srt.bak exists!")
            pass
        elif self.get_vedio_num() <= 0:
            # Subtitles but no videos: keep going so the words still get studied.
            print("no .srt.bak at all!")
            if "srt" in self.files and \
                    len(self.files["srt"]) > 0:
                self.srt_to_baks()
                pass
            print("------------")
            print("after move srt to bak")
            self.check_types(path)
            if len(self.files["bak"]) == len(old_bak):
                print("baks num get no increase!")
            if 'bak' in self.files:
                old_bak = copy.deepcopy(self.files["bak"])
        else:
            print("no .srt.bak at all!")
            pass
        # Collect baks that do not yet have a (regenerated) srt.
        srt_cre = []
        if "srt" in self.files:
            srt_set = set([os.path.join(x[0], x[1]) for x in self.files["srt"]])
        else:
            srt_set = set()
        if len(old_bak) > 0:
            for f in self.files["bak"]:
                bak = os.path.join(f[0], f[1])
                bak_base = os.path.splitext(bak)[0]  # will get *.srt
                # print(bak_base)
                if bak_base not in srt_set:
                    srt_cre.append(bak)
        self.sub.load_old_data()
        self.sub.add_files(srt_cre)
        self.sub.check_all(encode='utf-8')
        # Also drop a copy of the words Excel sheet in the target folder.
        output_dir = "output/output.xlsx"
        if self.sub.words_len() > 0 and os.path.exists(output_dir):
            shutil.copy(output_dir, path)
            print("output.xlsx was copy to " + path)
        self.sub.show()
        # Promote each generated *_word.srt back to the plain .srt name.
        failedfiles = []
        srt_files = []
        for f in srt_cre:
            base = os.path.splitext(f)[0]  # will get *.srt
            print("base:", base)
            wordsrt = base + "_word.srt"
            srt = base
            print(wordsrt)
            if os.path.exists(wordsrt):
                shutil.move(wordsrt, srt)
                srt_files.append(srt)
            else:
                failedfiles.append(srt_cre)
        print("files failed to create .srt_word.srt:\n")
        for f in failedfiles:
            print(f)
        print('-------------')
        print("srt files created:\n")
        for f in srt_files:
            print(f)
        print('-------------')
        self.get_m3u(path)

    def get_m3u(self, path):
        """Build an .m3u playlist for every video that has a matching .srt."""
        self.check_types(path)
        old_m3u = set()
        if 'm3u' in self.files:
            old_m3u = copy.deepcopy(self.files['m3u'])
        else:
            old_m3u = []
        for postfix in self.vediotypes:
            if postfix in self.files:
                for f in self.files[postfix]:
                    vedio = os.path.join(f[0], f[1])
                    base = os.path.splitext(vedio)[0]  # will get * without .mkv
                    m3u = base + ".m3u"
                    srt = base + ".srt"
                    if not os.path.exists(m3u) and os.path.exists(srt):
                        print(vedio, "is getting m3u...")
                        set_m3u(vedio, self.logger)
        self.check_types(path)
        # If the m3u count did not grow, dump old vs new for diagnosis.
        if 'm3u' in self.files and len(old_m3u) >= len(self.files['m3u']):
            print("old_m3u:\n", old_m3u)
            print("new_m3u:\n", self.files['m3u'])
        pass

    def transfer_paths(self, paths):
        """Dispatch each directory in `paths` to the configured pipeline."""
        self.logger.info("paths:{}".format(paths))
        # print(paths)  # for debug
        for p in paths:
            if os.path.isfile(p):
                print("should be dir, don't input file:" + p)
            elif os.path.isdir(p):
                print("transfer dir:" + p)
                if self.is_m3u_only is True:
                    # Playlist-only mode: rebuild m3u files, skip the rest.
                    self.check_types(p)
                    self.rm_m3u(p)
                    self.get_m3u(p)
                else:
                    self.transfer_dir(p)
            else:
                print(p, "not exists!")
        pass
from turtle_graphics import TextMarker

# --- U.S. States guessing game (script truncated past this chunk) ---
# Window setup: size the Tk canvas and position the window at (100, 100).
screen = turtle.Screen()
width, height = screen.window_width(), screen.window_height()
canvas = screen.getcanvas()
left, top = 100, 100
geom = '{}x{}+{}+{}'.format(width, height, left, top)
canvas.master.geometry(geom)
screen.title("U.S. States Game | 0/50 ")
# Use the blank US map image as the turtle's shape (background map).
image = "blank_states_img.gif"
screen.addshape(image)
turtle.shape(image)
subtitle = Subtitle()
states = States()
states.new_game_memory()
game_is_on = True
while game_is_on:
    screen.update()
    sleep(0.1)
    # ADD TEXT TO SCREEN
    score = states.last_score()
    answer = screen.textinput(title=f"Guess a State | {score}/50 ",
                              prompt=" What's another state's name?\n")
    # BREAK OUT ON EXIT
    # NOTE(review): textinput() returns None when the dialog is cancelled,
    # which would make .title() raise AttributeError — confirm handling.
    if answer.title() == "Exit":
        states.states_to_learn()
import sys
import os
from subtitle import Subtitle

# Paths are parameterised by the dataset name given as the first CLI arg.
SUB_PATH = "/media/data/mtriet/raw_video/%s/train" % (sys.argv[1])
FRAME_PATH = "/media/data/mtriet/dataset/scnn_%s_frames" % sys.argv[1]

if __name__ == '__main__':
    # Usage: <dataset> <front_pad> <rear_pad> — exactly 3 or 4 argv entries.
    if len(sys.argv) > 4 or len(sys.argv) < 3:
        print('command fb/bb front_pad rear_pad')
        sys.exit(0)
    # For each video folder under FRAME_PATH, write a padded .aqt file
    # next to the raw subtitles.
    for frame_root, sub_folder, _ in os.walk(FRAME_PATH):
        for folder in sub_folder:
            print(folder)
            subtitles = Subtitle.load_subtitle(SUB_PATH, folder, False)
            with open("%s/%s_pad.aqt" % (SUB_PATH, folder), 'w') as f:
                for s in subtitles:
                    # Pads are passed through as raw argv strings —
                    # presumably parsed inside to_string; TODO confirm.
                    f.write(s.to_string(sys.argv[2], sys.argv[3]))
class TransferBak(object):
    """
    # save srt to srt.bak
    # transfer all srt.bak files
    # mv wordch.srt to srt
    # create m3u from sub srt
    # create and cat word.txt
    """

    def __init__(self, logger=None, log_level=logging.INFO):
        # Set up logging, the Subtitle engine, and file-type tables.
        classname = type(self).__name__
        # print("classname: ", classname)
        if logger is None:
            self.logger = create_log(log_name=classname, level=log_level)
        else:
            self.logger = logger
        self.logger.info("\n-----------------")
        self.logger.info("Begin to init")
        self.logger.info("\n-----------------")
        self.sub = Subtitle(self.logger)
        self.sub.set_parse(True)
        self.files = dict()                          # extension -> set of (dir, name)
        self.filetypes = ['srt', 'bak', 'm3u', 'txt']
        self.vediotypes = ['mkv', 'mp4', 'avi']      # video extensions (sic "vedio")
        self.rmsrt = False
        if self.sub.lexicon_path is None:
            self.sub.set_lexicon_file("lexicon/lexicon.xlsx")
        pass

    def set_logger(self, logger):
        self.logger = logger
        pass

    def show_files(self):
        # Print a count of collected files per extension.
        for k, v in self.files.items():
            print(k + ": {}".format(len(v)))
        pass

    def clear_no_en(self, path):
        """Scan `path`; when .en.srt files exist, purge non-English variants.

        Collects files by extension (tracking ".en.srt" separately), then
        delegates the actual cleanup to rm_not_en().
        """
        if os.path.isfile(path):
            print("should be dir, don't input file:" + path)
            return None
        elif os.path.isdir(path):
            print("transfer dir:" + path)
        else:
            print(path, "not exists!")
            return None
        self.files = dict()
        for root, dirs, files in os.walk(path):
            for file in files:
                if file.endswith(".en.srt"):
                    if ".en.srt" not in self.files:
                        self.files[".en.srt"] = set()
                    self.files[".en.srt"].add((root, file))
                for postfix in (self.filetypes + self.vediotypes):
                    if file.endswith("." + postfix):
                        if postfix not in self.files:
                            self.files[postfix] = set()
                        self.files[postfix].add((root, file))
                        break
        self.show_files()
        # Only clean up when English subtitles actually exist.
        if ".en.srt" not in self.files or len(self.files[".en.srt"]) <= 0:
            return None
        self.rm_not_en(path)
        pass

    def rm_not_en(self, path):
        """Delete non-English srt/txt files; rename *.en.srt to plain *.srt."""
        if 'srt' in self.files:
            for f in self.files['srt']:
                p = os.path.join(f[0], f[1])
                if not p.endswith(".en.srt"):
                    os.remove(p)
                    print(p + " was removed!")
                elif p.endswith(".en.srt"):
                    print("move " + p + " to " + p.replace(".en.srt", ".srt"))
                    shutil.move(p, p.replace(".en.srt", ".srt"))
        if 'txt' in self.files:
            for f in self.files['txt']:
                p = os.path.join(f[0], f[1])
                if not p.endswith(".en.txt"):
                    os.remove(p)
                    print(p + " was removed!")
def main(argv=None, logger=None): if(logger is None): logger=createLog(logname="subtitle",level=logging.INFO) fname=None startDtime=datetime.now() print "Start time: "+str(startDtime)#.strftime("%Y-%m-%d %H:%M:%S")) print #sub=Subtitle(logging.getLogger()) sub=Subtitle(logger) try: opts, args=getopt.getopt( argv, "hvf:w:t:d:p:?lm:WDc", ["help", "version", "checkup" "file=","word=","type=","dir=","pickle=","limit="]) #print opts, args logger.info("opts:{0};args:{1}".format(opts, args)) except getopt.GetoptError as msg: print "error happened when get options!!! error:{0}".format(msg) usage() logger.error("getopt.GetoptError:{0}, exit!".format(msg)) sys.exit(2) except Exception as msg: logger.error("error:{0}, exit!".format(msg)) sys.exit(2) _is_lines_show=False _is_words_show=False sub_type = "" words_limit=None for opt, arg in opts: if(opt in ("-?","-h", "--help")): usage() sys.exit() pass elif(opt in ("-v", "--version")): version() sys.exit() pass elif(opt in ("-c", "--checkup")): sub.checkup=True pass elif(opt in ("-d", "--dir")): print "Sorry, -d --dir option still not offer" sys.exit() pass elif(opt in ("-p", "--pickle")): pkl=arg sub.setLexiconFile(pkl) pass elif(opt in ('-f',"--file")): fname= arg sub.addFile(fname) pass elif(opt == '-D'): logger.setLevel(logging.DEBUG) sub.setLogger(logger) pass elif(opt in ("-w", "--word")): word = arg sub.addWord(word) pass elif(opt in ("-t","--type")): sub_type = arg if(sub_type not in ('word', 'scan')): usage() sys.exit() pass pass elif(opt in ("-m","--limit")): words_limit= int(arg) #print words_limit _is_words_show=True pass elif(opt == '-l'): #show lines _is_lines_show=True pass elif(opt == '-W'): #show words _is_words_show=True pass """ if(len(sys.argv)<2): print "need args!!" 
logger.error("need args!!sys.argv:{0}".format(sys.argv)) return None pass """ #print sys.argv #sub.addPunctuation([',','!',';','.',':','>','<']) #sub.addLexicon(["hello", "world"]) if sub.lexicon_path is None: sub.setLexiconFile("lexicon.pickle") sub.loadOldData() sub.addFiles(args) #sub.addStrings("hello world, I'm wang. Please call me wang.") sub.parse() if(_is_lines_show): sub.lines_show() pass if(_is_words_show): #print words_limit sub.words_show(words_limit) pass sub.show() if(sub_type =='word'): sub.dumpData() print endDtime = datetime.now() print "End time: "+str(endDtime) timedelta = endDtime-startDtime print "Cost time: "+str(timedelta) #getChecksum(sys.argv[1]) pass
def parse(self, content):
    """Parse SRT `content` with a token-driven state machine.

    Feeds characters to self.analyser; each newly emitted SrtToken drives
    the transition start -> counter -> start_time -> arrow -> end_time ->
    text -> start. On an unexpected token type an error is printed and the
    method returns None; otherwise returns the list of Subtitle objects.
    """
    subtitles = []
    number = None
    start_time = None
    end_time = None
    text = ""
    self.i = 0
    ch = content[self.i]
    token_count = len(self.analyser.tokens)
    while self.i < len(content):
        move_cursor = self.analyser.read_char(ch)
        # Check whether the analyser just produced a new token.
        if len(self.analyser.tokens) > token_count:
            token_count += 1
            new_token = self.analyser.tokens[-1]
            assert isinstance(new_token, SrtToken)
            # Dispatch on the current state.
            if self.state == SrtParser.start_state:
                # Expecting the block's counter (sequence number).
                if new_token.type != SrtToken.TYPE_COUNTER:
                    print("ERROR, TYPE COUNTER NEEDED BUT", new_token.type, "FOUND")
                    return
                print("COUNTER", new_token.value)
                number = int(new_token.value)
                # Move to the counter state.
                self.state = SrtParser.counter_state
            elif self.state == SrtParser.counter_state:
                # Expecting the start timestamp.
                if new_token.type != SrtToken.TYPE_TIMESTAMP:
                    print("ERROR, TYPE TIMESTAMP NEEDED BUT", new_token.type, "FOUND")
                    return
                print("START TIME", new_token.value)
                start_time = new_token.value
                # Move to the start-time state.
                self.state = SrtParser.start_time_state
            elif self.state == SrtParser.start_time_state:
                # Expecting the "-->" separator.
                if new_token.type != SrtToken.TYPE_TIME_ARROW:
                    print("ERROR, TYPE ARROW NEEDED BUT", new_token.type, "FOUND")
                    return
                print(new_token.value)
                # Move to the arrow state.
                self.state = SrtParser.arrow_state
            elif self.state == SrtParser.arrow_state:
                # Expecting the end timestamp.
                if new_token.type != SrtToken.TYPE_TIMESTAMP:
                    print("ERROR, TYPE TIMESTAMP NEEDED BUT", new_token.type, "FOUND")
                    return
                print("END TIME", new_token.value)
                end_time = new_token.value
                # Move to the end-time state.
                self.state = SrtParser.end_time_state
            elif self.state == SrtParser.end_time_state or self.state == SrtParser.text_state:
                # Accept any non-newline string (i.e. not a blank line).
                # print("TYPE:", new_token.type, [new_token.value])
                if new_token.value != "\n":
                    # Enter/stay in the text state, accumulating lines.
                    self.state = SrtParser.text_state
                    print("TEXT:", new_token.value)
                    if not new_token.value.endswith("\n"):
                        new_token.value += "\n"
                    text += new_token.value
                else:
                    # Blank line terminates the block: back to start state.
                    self.state = SrtParser.start_state
                    print("------- END OF A BLOCK OF SUBTITLE -------")
                    # Build the Subtitle for the completed block.
                    subtitle = Subtitle(number, start_time, end_time, text)
                    subtitles.append(subtitle)
                    # Reset the per-block accumulators.
                    number = None
                    start_time = end_time = None
                    text = ""
        if move_cursor:
            self.i += 1
            if self.i < len(content):
                ch = content[self.i]
            else:
                break
    print("Tokens:")
    for token in self.analyser.tokens:
        print(token)
    return subtitles
def subtitles(self) -> List[Subtitle]:
    """Return a Subtitle for each entry under ``subtitles_path``.

    EAFP: a missing directory simply yields an empty list.
    (Annotation fixed: the elements are Subtitle wrappers, not raw Paths.)
    """
    try:
        return [Subtitle(subtitle) for subtitle in self.subtitles_path.iterdir()]
    except FileNotFoundError:
        return []
class Sub_testCase(unittest.TestCase):
    '''unit test for Subtitle Class'''

    def __init__(self, *args, **kwargs):
        # Fixture paths shared by all test methods.
        unittest.TestCase.__init__(self, *args, **kwargs)
        self.pkl = "../data/test.pickle"
        self.fname = '../data/vocabulary/Vocabulary -juniorHighschool(chinese) .txt'
        pass

    def setUp(self):
        # Fresh Subtitle under test for every case; DEBUG for verbose logs.
        self.sub = Subtitle(loglevel=logging.DEBUG)
        pass

    def tearDown(self):
        # Remove the pickle written by dumpData so tests stay independent.
        if os.path.exists(self.pkl):
            os.remove(self.pkl);
            pass
        pass

    def sub_assert(self, sub, lex=None, stem_lex=None, words=None,
                   stem_words=None, new_words=None):
        """Assert selected size invariants of a Subtitle instance.

        lex / stem_lex are exact counts; words / stem_words / new_words are
        upper bounds. Any argument left as None is skipped.
        """
        if lex:
            self.assertEqual(len(sub.lexicon), lex)
            pass
        if stem_lex:
            self.assertEqual(len(sub.stem_lexicon), stem_lex)
            pass
        if words:
            self.assertLessEqual(len(sub.wordSet), words)
            pass
        if stem_words:
            self.assertLessEqual(len(sub.stem_newWords), stem_words)
            pass
        if new_words:
            self.assertLessEqual(len(sub.newWords), new_words)
            pass
        pass

    def test_addWord(self):
        """addWord/addWords only take effect after parse() is called."""
        self.sub.setLexiconFile(self.pkl)
        self.sub.addWord('eyes')
        #print type(self.sub.newWords)
        #assert type(self.sub.newWords) is Counter
        # Before parse(), newWords has not been materialised yet.
        self.assertIs(self.sub.newWords, None)
        #self.sub.show()
        self.sub_assert(self.sub, lex=0, stem_lex=0)
        self.sub.parse()
        #self.sub.words_show()
        #print type(self.sub.newWords)
        self.assertIs(type(self.sub.newWords), Counter)
        #assert type(self.sub.newWords) is Counter
        #self.sub.show()
        self.sub_assert(self.sub, lex=0, stem_lex=0, words=1,
                        stem_words=1, new_words=1)
        self.sub.addWords(['anymore', 'sold'])
        #self.sub.show()
        # Counts unchanged until the next parse().
        self.sub_assert(self.sub, lex=0, stem_lex=0, words=1,
                        stem_words=1, new_words=1)
        self.sub.parse()
        #self.sub.show()
        self.sub_assert(self.sub, lex=0, stem_lex=0, words=2,
                        stem_words=2, new_words=2)
        #print self.sub.raw
        pass

    def test_dumpData(self):
        """Parse the vocabulary fixtures and pickle the result to self.pkl."""
        self.sub.setLexiconFile(self.pkl)
        #self.sub.loadOldData()
        self.sub.addFile(self.fname)
        self.sub.addFile('../data/vocabulary/Vocabulary -highschool(chinese).txt')
        self.sub.addFile('../data/vocabulary/Vocabulary-cet-4 (chinese).txt')
        self.sub.parse()
        #self.sub.show()
        self.sub.dumpData()
        self.sub_assert(self.sub, lex=0, stem_lex=0, words=4156,
                        stem_words=4156, new_words=4156)
        '''
        assert len(self.sub.lexicon) == 0
        assert len(self.sub.stem_lexicon) == 0
        assert len(self.sub.wordSet) <= 3929
        assert len(self.sub.stem_newWords) <= 3929
        assert len(self.sub.newWords) <= 3929
        '''
        pass

    def test_html_scan(self):
        """Scan a remote HTML page against the lexicon built by test_dumpData.

        NOTE(review): requires live network access; counts are fixture-
        specific magic numbers.
        """
        self.test_dumpData()
        sub_scan = Subtitle(loglevel=logging.DEBUG)
        sub_scan.setLexiconFile(self.pkl)
        sub_scan.loadOldData()
        #sub_scan.addFile('https://www.coursera.org/')
        sub_scan.addFile('https://selenium-python.readthedocs.org/en/latest/index.html')
        sub_scan.parse()
        #sub_scan.words_show(50)
        #sub_scan.show()
        self.sub_assert(sub_scan, lex=4156, stem_lex=3104, words=127,
                        stem_words=71, new_words=32)
        pass

    def test_txt_scan(self):
        """Scan a movie .srt against the lexicon built by test_dumpData."""
        self.test_dumpData()
        sub_scan = Subtitle(loglevel=logging.DEBUG)
        sub_scan.setLexiconFile(self.pkl)
        sub_scan.loadOldData()
        sub_scan.addFile('../data/srt/Lord.of.War.eng.480p.SDHF-NORMTEAM.srt')
        sub_scan.parse()
        #sub_scan.words_show(50)
        #sub_scan.show()
        self.sub_assert(sub_scan, lex=4156, stem_lex=3104, words=1936,
                        stem_words=943, new_words=518)
        '''
        assert len(sub_scan.lexicon) == 3929
        assert len(sub_scan.stem_lexicon) == 2968
        assert len(sub_scan.wordSet) <= 1807
        assert len(sub_scan.stem_newWords) <= 922
        assert len(sub_scan.newWords) <= 531
        '''
        # Character names must be detected as names…
        names_mv = ['Yuri', 'Simeon']
        for n in names_mv:
            assert n in sub_scan.nameSet
            '''
            if n in sub_scan.nameSet:
                print n;
                pass
            '''
            pass
        #print sub_scan.nameSet
        # …and therefore must not be reported as unknown new words.
        for n in names_mv:
            '''
            if n.lower() in sub_scan.newWords:
                print n;
                pass
            '''
            assert n.lower() not in sub_scan.newWords
            pass
        pass

    def test_word(self):
        """Parse a single vocabulary fixture and check its word counts."""
        self.sub.setLexiconFile(self.pkl)
        #self.sub.loadOldData()
        self.sub.addFile(self.fname)
        self.sub.parse()
        #self.sub.show()
        self.sub_assert(self.sub, lex=0, stem_lex=0, words=1599,
                        stem_words=1599, new_words=1599)
        '''
        assert len(self.sub.lexicon) == 0
        assert len(self.sub.stem_lexicon) == 0
        assert len(self.sub.wordSet) <= 1449
        assert len(self.sub.stem_newWords) <= 1449
        assert len(self.sub.newWords) <= 1449
        '''
        pass
    pass
def new_ost_subtitle(self, lang: str = Constants.OST_LANG,
                     filtr: Callable[[dict], bool] = lambda filtr: True) -> Optional[Subtitle]:
    """Fetch a Subtitle from OpenSubtitles for this title's IMDb id.

    Args:
        lang: subtitle language, defaulting to Constants.OST_LANG.
        filtr: predicate over candidate result dicts; the default accepts
            everything. (The lambda's parameter shadowing the name `filtr`
            is harmless but confusing.)

    Returns:
        The new Subtitle, or None when Subtitle.ost_new finds nothing.
    """
    return Subtitle.ost_new(self.imdbid, lang, filtr)
import util

# --- sliding-window scan over extracted frames (script truncated below) ---
# Paths are parameterised by the dataset name given as the first CLI arg.
FRAME_PATH = "/media/data/mtriet/dataset/scnn_%s_frames" % sys.argv[1]
SUB_PATH = "/media/data/mtriet/raw_video/%s/train" % (sys.argv[1])
WINDOW_SIZE = [16, 32, 64, 128, 256, 512]   # window lengths (in frames) to try
OVERLAP_RATE = 0.75                         # consecutive windows overlap 75%
CLASSES = [[], []]  # 0, 1

if len(sys.argv) < 3:
    print('fb pad=True/False')
    sys.exit(1)

for window_size in WINDOW_SIZE:
    for frame_root, sub_folder, sub_files in os.walk(FRAME_PATH):
        for folder in sub_folder:
            # NOTE(review): sys.argv[2] is passed as a raw string — the
            # string "False" is truthy, so load_subtitle may not see the
            # intended boolean; confirm how the flag is interpreted.
            subtitles = Subtitle.load_subtitle(SUB_PATH, folder, sys.argv[2])
            frames = sorted(os.listdir(frame_root + '/' + folder))
            sub_index = 0
            # Slide the window with stride window_size * (1 - OVERLAP_RATE).
            for begin_pivot in range(
                    1, len(frames) - window_size,
                    int(window_size * (1 - OVERLAP_RATE))):
                # ignore last few frames
                # Advance to the next subtitle once the window passes it.
                if (begin_pivot > subtitles[sub_index].end) and (
                        sub_index < len(subtitles) - 1):
                    sub_index += 1
                end_pivot = min(begin_pivot + window_size, len(frames))
                segment = range(begin_pivot, begin_pivot + window_size + 1)
                sub_range = subtitles[sub_index].get_range()
                # Overlap (in frames) between the window and the subtitle.
                intersection = np.intersect1d(segment, sub_range)
        '-p', dest='path_folder', type=str, required=True,
        help='Path folder to find .str files')
    # (continuation of an argparse setup whose start is outside this chunk)
    self.parser.add_argument('--overwrite_file', '--o', '-o',
                             dest='overwrite_file', type=str, required=True,
                             help='Overwrite the file')
    self.args = self.parser.parse_args()

    def get_params(self):
        # Accessor for the parsed CLI arguments.
        return self.args

if __name__ == '__main__':
    Credits()
    params = Parameters().get_params()
    # Process every .srt file found under the given folder.
    files = HandleFiles(params.path_folder, ".srt").find_files()
    for file in files:
        print(f'File: "{file}"')
        sub = Subtitle(file)
        try:
            print(params.overwrite_file)
            sub.run(params.overwrite_file)
        except:
            # NOTE(review): bare except silently swallows every error,
            # including KeyboardInterrupt — prefer `except Exception` with
            # logging so failures per file are at least visible.
            pass