def merge_srt(zh_file_path, en_file_path, new_file_path):
    """Merge a Chinese and an English SRT file into one bilingual SRT file.

    Every output subtitle keeps the index and timing of the Chinese
    subtitle; its content is the Chinese text, a newline, and the content of
    the English subtitle at the same position.

    :param zh_file_path: path of the UTF-8 encoded Chinese SRT file
    :param en_file_path: path of the UTF-8 encoded English SRT file
    :param new_file_path: path the merged SRT text is written to
    :return: ``False`` (nothing is written) when the two files do not hold
        the same number of subtitles, ``True`` after a successful merge
    """
    # Context managers close the handles even when read() or parsing raises.
    with open(zh_file_path, mode='r', encoding='utf-8') as zh_file:
        zh_srt_s = zh_file.read()
    with open(en_file_path, mode='r', encoding='utf-8') as en_file:
        en_srt_s = en_file.read()

    zh_subs = list(srt.parse(zh_srt_s))
    en_subs = list(srt.parse(en_srt_s))
    if len(zh_subs) != len(en_subs):
        return False

    subs = []
    for zh_sub, en_sub in zip(zh_subs, en_subs):
        # The Chinese subtitle object is reused so index/start/end carry over.
        zh_sub.content = srt.make_legal_content(
            zh_sub.content + '\n' + en_sub.content)
        subs.append(zh_sub)
    srt_s = srt.compose(subs)

    # Write srt_s out to new_file
    with open(new_file_path, 'w+', encoding='utf-8') as new_file:
        new_file.write(srt_s)
    return True
def zameniImena(text_in):
    """Run the dictionary-based whole-word replacements over ``text_in``.

    Six substitution passes are chained, each one working on the output of
    the previous pass, in this order: ``dictionary_1``, ``dictionary_2``,
    ``dictionary_0``, ``dict1_n``, ``dict2_n``, ``dict0_n``.

    :param text_in: subtitle text; when it parses to at least one subtitle
        it is first re-composed through ``srt.parse``/``srt.compose``
    :return: tuple ``(much, text)`` where ``much`` is the sum of the six
        substitution counts and ``text`` is the output of the sixth pass
    """
    if len(list(srt.parse(text_in))) == 0:
        logger.debug(f"Transkrib, No subtitles found.")
    else:
        # Round-trip through the parser (parse errors ignored) before replacing.
        text_in = srt.compose(srt.parse(text_in, ignore_errors=True))
    # One pattern per dictionary: an alternation of its escaped keys, wrapped
    # in word boundaries.
    robj1 = re.compile(r'\b(' + '|'.join(map(re.escape, dictionary_1.keys())) + r')\b')
    robj2 = re.compile(r'\b(' + '|'.join(map(re.escape, dictionary_2.keys())) + r')\b')
    robj3 = re.compile(r'\b(' + '|'.join(map(re.escape, dictionary_0.keys())) + r')\b')
    robjN1 = re.compile(r'\b(' + '|'.join(map(re.escape, dict1_n.keys())) + r')\b')
    robjN2 = re.compile(r'\b(' + '|'.join(map(re.escape, dict2_n.keys())) + r')\b')
    robjN0 = re.compile(r'\b(' + '|'.join(map(re.escape, dict0_n.keys())) + r')\b')
    robjL0 = re.compile(r'\b(' + '|'.join(map(re.escape, dict0_n2.keys())) + r')\b')
    robjL1 = re.compile(r'\b(' + '|'.join(map(re.escape, dict1_n2.keys())) + r')\b')
    robjL2 = re.compile(r'\b(' + '|'.join(map(re.escape, dict2_n2.keys())) + r')\b')
    try:
        # subn returns (new_text, count); each pass feeds the next one.
        t_out1 = robj1.subn(lambda x: dictionary_1[x.group(0)], text_in)
        t_out2 = robj2.subn(lambda x: dictionary_2[x.group(0)], t_out1[0])
        t_out3 = robj3.subn(lambda x: dictionary_0[x.group(0)], t_out2[0])
        t_out4 = robjN1.subn(lambda x: dict1_n[x.group(0)], t_out3[0])
        t_out5 = robjN2.subn(lambda x: dict2_n[x.group(0)], t_out4[0])
        t_out6 = robjN0.subn(lambda x: dict0_n[x.group(0)], t_out5[0])
    except Exception as e:
        # NOTE(review): after a failure here some of t_out1..t_out6 are never
        # assigned, yet they are read unconditionally below.
        logger.debug(F"Transkripcija, error: {e}")

    def doRepl(inobj, indict, text):
        # Returns only the substitution count; the replaced text is dropped.
        try:
            out = inobj.subn(lambda x: indict[x.group(0)], text)
            return out[1]
        except IOError as e:
            logger.debug(f"Replace keys, I/O error: {e}")
        except Exception as e:
            logger.debug(f"Replace keys, unexpected error: {e}")

    # The results of these three calls are not used: they affect neither the
    # returned text nor the returned count.
    if len(dict1_n2) != 0:
        doRepl(robjL1, dict1_n2, t_out6[0])
    if len(dict2_n2) != 0:
        doRepl(robjL2, dict2_n2, t_out6[0])
    if len(dict0_n2) != 0:
        doRepl(robjL0, dict0_n2, t_out6[0])
    much = t_out1[1] + t_out2[1] + t_out3[1] + t_out4[1] + t_out5[1] + t_out6[1]
    logger.debug(
        'Transkripcija u toku.\n--------------------------------------')
    logger.debug(f'Zamenjeno ukupno {much} imena i pojmova')
    return much, t_out6[0]
def set_basic_args(args):
    """Resolve ``args.input`` / ``args.output`` into parsed subtitles and a writer.

    ``args.input`` is replaced by the ``srt.parse`` generator of the decoded
    input (or, for a mutable sequence of inputs, each element is replaced in
    place). ``args.output`` is replaced by an encoding writer around either
    the dash stream or the named file opened in binary mode.

    :param args: argument namespace; ``encoding`` is read, ``inplace`` is
        optional, and ``input`` / ``output`` may be missing
    :raises ValueError: when ``inplace`` is set together with stdin input or
        with an explicit output
    """
    # collections.MutableSequence was removed in Python 3.10; the ABC lives in
    # collections.abc. Fall back for interpreters that lack collections.abc.
    try:
        from collections.abc import MutableSequence
    except ImportError:
        from collections import MutableSequence

    # TODO: dedupe some of this
    if getattr(args, "inplace", None):
        if args.input == DASH_STREAM_MAP["input"]:
            raise ValueError("Cannot use --inplace on stdin")

        if args.output != DASH_STREAM_MAP["output"]:
            raise ValueError("Cannot use -o and -p together")

        args.output = args.input

    for stream_name in ("input", "output"):
        log.debug('Processing stream "%s"', stream_name)

        try:
            stream = getattr(args, stream_name)
        except AttributeError:
            # For example, in the case of no_output
            continue

        # We don't use system default encoding, because usually one runs this
        # on files they got from elsewhere. As such, be opinionated that these
        # files are probably UTF-8. Looking for the BOM on reading allows us to
        # be more liberal with what we accept, without adding BOMs on write.
        read_encoding = args.encoding or "utf-8-sig"
        write_encoding = args.encoding or "utf-8"

        r_enc = codecs.getreader(read_encoding)
        w_enc = codecs.getwriter(write_encoding)

        log.debug("Got %r as stream", stream)
        # We don't use encoding= option to open because we want to have the
        # same universal newlines behaviour as STD{IN,OUT}_BYTESTREAM
        if stream in DASH_STREAM_MAP.values():
            log.debug("%s in DASH_STREAM_MAP", stream_name)
            if stream is args.input:
                args.input = srt.parse(r_enc(args.input).read())
            elif stream is args.output:
                # Since args.output is not in text mode (since we didn't
                # earlier know the encoding), we have no universal newline
                # support and need to do it ourselves
                args.output = w_enc(args.output)
        else:
            log.debug("%s not in DASH_STREAM_MAP", stream_name)
            if stream is args.input:
                if isinstance(args.input, MutableSequence):
                    for i, input_fn in enumerate(args.input):
                        if input_fn in DASH_STREAM_MAP.values():
                            if stream is args.input:
                                args.input[i] = srt.parse(r_enc(input_fn).read())
                        else:
                            f = r_enc(open(input_fn, "rb"))
                            with f:
                                args.input[i] = srt.parse(f.read())
                else:
                    f = r_enc(open(stream, "rb"))
                    with f:
                        args.input = srt.parse(f.read())
            else:
                args.output = w_enc(open(args.output, "wb"))
def __init__(self, subtitle_path: str, language: str) -> None:
    """
    Load a subtitle file, extract its cleaned text and build study data.

    :param subtitle_path: path of the srt file.
    :param language: language of the subtitle; used as the key into
        ``self.spacy_default_models``.

    The file is parsed with ``srt``. On ``srt.SRTParseError`` a copy that
    starts at the first line matching the index ``1`` is written to
    ``reworked_subtitle.srt`` and parsed instead; a second parse error is
    re-raised.
    """
    self.subtitle_path = subtitle_path
    self.language = language
    self.nlp = select_spacy_model(self.spacy_default_models[language])
    self.text = ""
    self.tokens = ""
    self.important_words = ""
    self.study_dict = {}
    self.film_level = 0

    # Open subtitle file and pre-process it
    try:
        with open(subtitle_path) as raw_subtitle:
            parsed = list(srt.parse(raw_subtitle))
            # extract texts with srt
            for entry in parsed:
                self.text += clean_line(entry.content) + " "
            self.text = clean_text(self.text)
    except srt.SRTParseError:
        print("The srt file has parsing problems. Trying to fix the File.")
        with open(subtitle_path, "r") as f:
            lines = f.readlines()

        # Drop everything before the first index line, then copy verbatim.
        copying = False
        with open("reworked_subtitle.srt", "w") as new_f:
            for line in lines:
                if copying:
                    new_f.write(line)
                elif re.match("^.?1(\n)?$", line) is not None:
                    new_f.write("1\n")
                    copying = True

        try:
            with open("reworked_subtitle.srt") as raw_subtitle:
                parsed = list(srt.parse(raw_subtitle))
                # extract texts with srt
                for entry in parsed:
                    self.text = self.text + clean_line(entry.content) + " "
                self.text = clean_text(self.text)
        except srt.SRTParseError as error:
            print(
                "The srt file has parsing problems that could not be fixed."
            )
            raise error

    self.__tokenize_and_process()
    self.__create_study_dicts()
def main(argv):
    """Merge the primary and secondary language SRT files of the current directory.

    The first ``./*.<primary>.srt`` and ``./*.<secondary>.srt`` files are
    read; primary subtitles are wrapped in a yellow ``<font>`` tag, secondary
    subtitles are prefixed with ``{\\an8}`` (top placement), and the sorted,
    re-indexed union is written next to the primary file with the language
    code replaced by ``merged``.

    :param argv: command-line arguments without the program name
        (``-p/--primary-language``, ``-s/--secondary-language``, ``-h``)
    """
    # Single usage string: the original error branch concatenated two literals
    # without a separating space.
    usage = ('merge_subtitles.py -p <primary_language>'
             ' -s <secondary_language>')

    # Parse arguments
    primary_language = ''
    secondary_language = ''
    try:
        opts, args = getopt.getopt(
            argv, "hp:s:", ["primary-language=", "secondary-language="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(usage)
            sys.exit()
        elif opt in ("-p", "--primary-language"):
            primary_language = arg
        elif opt in ("-s", "--secondary-language"):
            secondary_language = arg

    # Read files and convert to list
    primary_path = glob.glob('./*.' + primary_language + '.srt')[0]
    secondary_path = glob.glob('./*.' + secondary_language + '.srt')[0]
    # Context managers close the handles even if reading fails.
    with open(primary_path, 'r', errors='ignore') as primary_file:
        primary_text = primary_file.read()
    with open(secondary_path, 'r', errors='ignore') as secondary_file:
        secondary_text = secondary_file.read()
    subtitles_primary = list(srt.parse(primary_text))
    subtitles_secondary = list(srt.parse(secondary_text))

    # Make primary yellow
    for s in subtitles_primary:
        s.content = '<font color="#ffff54">' + s.content + '</font>'

    # Place secondary on top
    for s in subtitles_secondary:
        s.content = '{\\an8}' + s.content

    # Merge
    subtitles_merged = subtitles_primary + subtitles_secondary
    subtitles_merged = list(srt.sort_and_reindex(subtitles_merged))

    # Write merged to file
    merged_path = primary_path.replace(primary_language, 'merged')
    merged_text = srt.compose(subtitles_merged)
    with open(merged_path, 'w') as merged_file:
        merged_file.write(merged_text)
def test_subtitle_from_scratch_equality(subtitle):
    """Two independent parses of the same SRT block compare and hash equal."""
    srt_block = subtitle.to_srt()

    # Parse twice so the two subtitles share no objects; otherwise the hash
    # comparison below would be trivially true.
    sub_1, sub_2 = [list(srt.parse(srt_block))[0] for _ in range(2)]

    subs_eq([sub_1], [sub_2])
    # Guard against subs_eq and == disagreeing for some reason
    assert sub_1 == sub_2
    assert hash(sub_1) == hash(sub_2)
def test_subtitle_from_scratch_equality(subtitle):
    """Two independent parses of the same SRT block compare and hash equal."""
    srt_block = subtitle.to_srt()

    # Parse twice so the two subtitles share no objects; otherwise the hash
    # comparison below would be trivially true.
    sub_1, sub_2 = [list(srt.parse(srt_block))[0] for _ in range(2)]

    subs_eq([sub_1], [sub_2])
    # Guard against subs_eq and eq disagreeing for some reason
    eq(sub_1, sub_2)
    eq(hash(sub_1), hash(sub_2))
def GetText(self):
    """Return the default subtitles as SRT text with edited entries applied.

    The default subtitles are parsed from ``self.default_subs``. Every
    subtitle in ``self.new_subs``, and then every subtitle parsed from the
    ``self.text_2`` widget, replaces each default subtitle that has the same
    index but different content.

    :return: the composed SRT text
    """
    n_subs = list(srt.parse(self.text_2.GetValue(), True))
    d_subs = list(srt.parse(self.default_subs, True))

    def _overlay(replacements):
        # Replace by position instead of d_subs.index(...): no linear search
        # on a list that is being mutated while iterated.
        for new_sub in replacements:
            for pos, old_sub in enumerate(d_subs):
                if (old_sub.index == new_sub.index
                        and old_sub.content != new_sub.content):
                    d_subs[pos] = new_sub

    _overlay(self.new_subs)
    _overlay(n_subs)
    return srt.compose(d_subs)
def gather_movie_vocabulary(path, name_of_movie):
    """Collect per-word data from the subtitles of a movie.

    :param path: path of the SRT file
    :param name_of_movie: value stored under ``'source'`` for every word
    :return: dict mapping each matched word to a dict with the keys
        ``'sample_sentences'``, ``'first_start_time'``, ``'first_end_time'``,
        ``'source'`` and ``'date_added'``
    """
    regex_words = r"[a-zA-Z]+'*[a-zA-Z]+"
    regex_sentences = r".*"  # FIXME
    todays_date = datetime.datetime.today().strftime('%Y %b%d')
    output = {}

    with open(path, "r") as fin:
        raw_srt = fin.read()

    for subtitle in srt.parse(raw_srt):
        for sentence in re.findall(regex_sentences, subtitle.content):
            for word in re.findall(regex_words, sentence):
                word_data = output.setdefault(word, {})
                word_data.setdefault('sample_sentences', []).append(sentence)
                # setdefault keeps the timing of the first occurrence only.
                word_data.setdefault('first_start_time', subtitle.start)
                word_data.setdefault('first_end_time', subtitle.end)
                word_data['source'] = name_of_movie
                word_data['date_added'] = todays_date
    return output
def test_compose_and_parse_strict_mode(content):
    """Strict composing strips blank lines; non-strict keeps content as is."""
    content = "\n{0}\n\n{0}\n".format(content)
    sub = CONTENTLESS_SUB(content=content)

    strict_content = list(srt.parse(sub.to_srt()))[0].content
    unstrict_content = list(srt.parse(sub.to_srt(strict=False)))[0].content

    # Strict mode should remove blank lines in content, leading, and trailing
    # newlines.
    assert_false(strict_content.startswith("\n"))
    assert_false(strict_content.endswith("\n"))
    assert_false("\n\n" in strict_content)

    # When strict mode is false, no processing should be applied to the
    # content (other than \r\n becoming \n).
    eq(unstrict_content, sub.content.replace("\r\n", "\n"))
def convert_txt():
    """Append the content of every subtitle in ``merge.srt`` to ``lyrics.txt``.

    Each subtitle's content is followed by a blank line. Parse errors are not
    ignored (``ignore_errors=False``).
    """
    # The input handle is now closed even when parsing raises.
    with open("merge.srt", "r", encoding="utf-8") as f:
        srtfile = list(srt.parse(f, ignore_errors=False))
    with open("lyrics.txt", "a", encoding="utf-8") as txtfile:
        for subtitle in srtfile:
            txtfile.write(subtitle.content + "\n\n")
def extract_voice_intervals(audio_file: str, sub_file: str,
                            shrink: timedelta = timedelta(seconds=0),
                            lag: timedelta = timedelta(seconds=0)) -> Tuple[int, List[np.ndarray]]:
    """
    Slice the audio signal into the intervals derived from the subtitles.

    Useful for testing accuracy of subtitles in tandem with
    wavfile.write("file_name.wav", rate, intr[numb]).

    :param audio_file: Path of a file from which voice intervals should be extracted
    :param sub_file: Path of a file with subtitles corresponding to the audio file
    :param shrink: Forwarded to ``intervals_from_subtitles`` as ``shrink``
    :param lag: Forwarded to ``intervals_from_subtitles`` as ``lag``
    :return: Tuple
        0: Rate of the audio which has been processed
        1: List of signal slices, one per interval
    """
    rate, sig = wavfile.read(audio_file)

    def sample_index(moment: timedelta) -> int:
        # Convert a time offset into a sample position at the audio rate.
        return int(round(moment.total_seconds() * rate))

    with open(sub_file, 'r', encoding='ISO-8859-15') as subs_raw:
        subs = list(srt.parse(subs_raw.read()))

    intervals = intervals_from_subtitles(subs, shrink=shrink, lag=lag)
    return rate, [sig[sample_index(item.start): sample_index(item.end)]
                  for item in intervals]
def prepare_test_data(audio_file, subs_file, output_file,
                      shrink=timedelta(seconds=0.0),
                      lag=timedelta(seconds=0.1)):
    """
    Build an ``SdaContent`` from an audio file and its subtitles and pickle it.

    Labels come from ``prepare_labels_from_subs`` and features from
    ``prepare_mfcc``; the labels are zero-padded at the end up to the number
    of feature rows.

    :param audio_file: The path to audio file
    :param subs_file: The path to sub file
    :param output_file: The path where SdaContent should be saved to
    :param shrink: Forwarded to ``prepare_labels_from_subs`` as ``shrink``
    :param lag: Forwarded to ``prepare_labels_from_subs`` as ``lag``
    :return:
    """
    frame_step = 0.01
    frame_size = 0.025

    with open(subs_file, 'r', encoding='ISO-8859-15') as subs_raw:
        raw_subs = subs_raw.read()
    labels = prepare_labels_from_subs(list(srt.parse(raw_subs)), frame_step,
                                      shrink=shrink, lag=lag)

    rate, sig = wavfile.read(audio_file)
    mfcc_features = prepare_mfcc(sig, rate, frame_step, frame_size)

    missing = mfcc_features.shape[0] - labels.shape[0]
    labels = np.pad(labels, (0, missing), constant_values=.0)

    test_data = SdaContent(rate, frame_step, mfcc_features, labels)
    with open(output_file, 'wb') as output:
        pickle.dump(test_data, output)
def extract_ref(filename, subs_query, output_query):
    """Map ranges of query subtitles onto index ranges of a reference SRT file.

    :param filename: path of the UTF-8 reference SRT file
    :param subs_query: sequence of subtitles the ranges refer to
    :param output_query: iterable of ``(a, b)`` index pairs into ``subs_query``
    :return: ``(subs, output)`` where ``subs`` is the parsed reference list
        and ``output`` holds one ``(first, last)`` reference index pair per
        query pair, or ``(0, -1)`` once the reference list is exhausted
    """
    with codecs.open(filename, 'r', encoding='utf8') as f:
        subs = list(srt.parse(f.read()))

    total = len(subs)
    cursor = 0  # never rewinds between query pairs
    output = []
    for first_idx, last_idx in output_query:
        window_start = subs_query[first_idx].start
        window_end = subs_query[last_idx].end

        # Skip reference subtitles that end before the window opens.
        while cursor < total and subs[cursor].end < window_start:
            cursor += 1
        span_start = cursor

        # Extend while the next reference subtitle starts inside the window.
        while cursor + 1 < total and subs[cursor + 1].start <= window_end:
            cursor += 1

        output.append((0, -1) if span_start == total else (span_start, cursor))
    return subs, output
def parseSRT(self, srt_filename):
    """Parse an SRT file and store the result on ``self.subtitles``.

    :param srt_filename: path of the SRT file to read
    :return: the list of parsed subtitles (also kept as ``self.subtitles``)
    """
    # The context manager closes the file even when read() raises.
    with open(srt_filename, "r") as f:
        subtitle_generate = srt.parse(f.read())
    self.subtitles = list(subtitle_generate)
    return self.subtitles
def download_subtitle(episode):
    """Download and parse every srt subtitle stream of ``episode``.

    :param episode: object providing ``reload()``, ``_server`` and
        ``iterParts()``
    :return: list with one list of parsed subtitles per downloaded stream;
        downloads whose parsing raises ``ValueError`` are logged and skipped
    """
    import srt

    episode.reload()
    LOG.debug('Downloading subtitle from PMS')
    pms = episode._server

    to_dl = []
    for part in episode.iterParts():
        if not part.subtitleStreams():
            continue
        for sub in part.subtitleStreams():
            if sub.key and sub.codec == 'srt':
                to_dl.append(
                    pms.url('%s?download=1' % sub.key, includeToken=True))

    all_subs = []
    for dl_url in to_dl:
        r = requests.get(dl_url)
        r.raise_for_status()
        if not r:
            continue
        try:
            all_subs.append(list(srt.parse(r.text)))
        except ValueError:
            LOG.exception('Failed to parse subtitle')
    return all_subs
def fit(self, fname, *_):
    """Read, decode and parse the subtitle file ``fname``.

    On success ``self.subs_``, ``self.fit_fname`` and
    ``self.detected_encoding_`` are set. When ``self.encoding`` is
    ``'infer'`` the encoding is taken from ``cchardet.detect``.

    :param fname: subtitle file to load
    :return: ``self``
    :raises Exception: the last error raised while decoding or parsing
    """
    if self.caching and self.fit_fname == fname:
        return self

    with open_file(fname, 'rb') as f:
        subs = f.read()

    if self.encoding == 'infer':
        encodings_to_try = (cchardet.detect(subs)['encoding'],)
    else:
        encodings_to_try = (self.encoding,)

    exc = None
    for encoding in encodings_to_try:
        try:
            decoded_subs = subs.decode(encoding, errors='replace').strip()
            if self.sub_format == 'srt':
                parsed_subs = srt.parse(decoded_subs)
            elif self.sub_format in ('ass', 'ssa'):
                parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
            else:
                raise NotImplementedError('unsupported format: %s' % self.sub_format)
            preprocessed = _preprocess_subs(
                parsed_subs,
                max_subtitle_seconds=self.max_subtitle_seconds,
                start_seconds=self.start_seconds)
            self.subs_ = GenericSubtitlesFile(
                preprocessed,
                sub_format=self.sub_format,
                encoding=encoding
            )
            self.fit_fname = fname
            self.detected_encoding_ = encoding
            logger.info('detected encoding: %s' % self.detected_encoding_)
            return self
        except Exception as e:
            # Remember the failure and move on to the next candidate encoding.
            exc = e
            continue
    raise exc
def test_parsing_no_content(subs):
    """Subtitles with empty content survive a non-strict compose/parse cycle."""
    for subtitle in subs:
        subtitle.content = ""

    composed = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(composed), subs)
def main(args):
    """Parse the command line and render a montage for every input path.

    Paths whose extension is ``mp4``, ``webm``, ``mkv`` or ``flv`` are played
    through ``playCvMontage`` into ``<name>_mon.avi``; every other path is
    opened as an image and saved as ``<name>_mon.png``.

    :param args: argument list handed to ``app.parse_args``
    """
    apg1.add_argument("--subtitle", type=FileType("r"), help="subtitle file for -text")
    apg1.add_argument("--subtitle-placeholder", type=str, default="#", help="placeholder for subtitle")
    apg1.add_argument("--mon-background", type=str, default=None, help="replacement back-color for mon (default -key-color)")

    def readSrt(it):
        return srt.parse(it.read())

    cfg = app.parse_args(args)
    if cfg.font is None:
        cfg.font = ImageFont.load_default()
    else:
        cfg.font = ImageFont.truetype(cfg.font, cfg.font_size)
    cfg.key_color = colorFromHtml(cfg.key_color)

    print(f"{cfg.font_size}px, {cfg.key_color} ±{cfg.key_thres} {cfg.spacing}")
    # Colors near the key color map to None; all others pass through.
    cfg.calc_draw_color = lambda c: None if isColorNearTo(cfg.key_color, cfg.key_thres, c) else c

    video_exts = "mp4 webm mkv flv".split(" ")
    for path in cfg.images:
        (name, ext) = fileExtNameSplit(path)
        if ext not in video_exts:
            image = Image.open(path)
            mon = Montage(cfg, image.size)
            mon.runOn(image).save(f"{name}_mon.png")
            continue

        cap = VideoCapture(path)
        (fps, count, width, height) = cv2VideoInfo(cap)
        print(f"{fps}fps*{count} {width}x{height}")
        mon = Montage(cfg, (width, height))
        playCvMontage(cap, mon, filename=f"{name}_mon.avi",
                      subtitle=let(readSrt, cfg.subtitle),
                      placeholder=cfg.subtitle_placeholder)
        cap.release()
def run(self) -> None:
    """
    Create the translated srt file.

    Every subtitle of the source file is translated from the source to the
    target language, echoed to stdout, and the result is written under the
    destination directory with the source file's name.

    :return: None
    """
    with self.__src_file.open('rt') as fp:
        file_contents = fp.read()

    filename = self.__src_file.name
    google_trans = google_translator()

    subtitles = []
    for sub in srt.parse(file_contents):
        translated_content = google_trans.translate(
            sub.content, lang_src=self.__lang_src, lang_tgt=self.__lang_tgt)
        translated_sub = srt.Subtitle(
            index=sub.index,
            start=sub.start,
            end=sub.end,
            content=translated_content,
            proprietary=sub.proprietary,
        )
        sys.stdout.write('[{0}][{1}]: {2}\n'.format(
            filename, translated_sub.index, translated_sub.content))
        subtitles.append(translated_sub)

    # Write the translated srt file
    dst_file = self.__dst_dirpath / self.__src_file.name
    with dst_file.open('wt') as fp:
        fp.write(srt.compose(subtitles))
def fragment_srt(text, log_reject):
    """Split SRT text into text fragments tagged with their time range.

    Consecutive subtitles whose content ends with one of
    ``SUBTITLE_CONTINUATION_CHARS`` are joined with the subtitle that follows
    them before being divided by ``clean_and_divide``.

    :param text: SRT text
    :param log_reject: forwarded to ``clean_and_divide``
    :return: list of dicts with the keys ``'text'`` and ``'loc'``
        (``t:<start>-<end>`` in seconds with three decimals)
    """
    # filter ones with no useful chars, like '♬~'
    subs = [sub for sub in srt.parse(text)
            if remove_spaces_punctuation(sub.content)]

    frags = []
    accum = []  # subs collected while they end with continuation chars (arrows)
    last_position = len(subs) - 1
    for position, sub in enumerate(subs):
        accum.append(sub)

        continues = sub.content.strip()[-1] in SUBTITLE_CONTINUATION_CHARS
        if continues and position != last_position:
            # keep accumulating until a sub without continuation char
            continue

        single_line_content = ' '.join(
            member.content.strip()
            .rstrip(SUBTITLE_CONTINUATION_CHARS)
            .strip('\r')
            .replace('\n', ' ')
            for member in accum)
        start_time = accum[0].start.total_seconds()
        end_time = accum[-1].end.total_seconds()
        loc = f't:{start_time:.3f}-{end_time:.3f}'

        for frag_text in clean_and_divide(single_line_content, log_reject):
            frags.append({'text': frag_text, 'loc': loc})
        accum = []
    return frags
def _srt_parse(s, max_subtitle_seconds=None, start_seconds=0, tolerant=True):
    """Parse SRT text into a list of subtitles, filtering and clamping them.

    :param s: SRT text handed to ``srt.parse``
    :param max_subtitle_seconds: when not ``None``, the longest duration a
        subtitle may keep; otherwise a cap of one day is applied
    :param start_seconds: subtitles starting before this offset are skipped
    :param tolerant: when true a ``ValueError`` raised while advancing the
        parser is logged as a warning; when false it is re-raised
    :return: list of the kept subtitles, with ``end`` clamped to
        ``start + max duration``
    """
    start_time = timedelta(seconds=start_seconds)
    subs = srt.parse(s)
    subs_list = []
    max_duration = timedelta(days=1)
    if max_subtitle_seconds is not None:
        max_duration = timedelta(seconds=max_subtitle_seconds)
    # Manual next() loop so an error on one item can be handled inside the
    # loop instead of aborting it.
    while True:
        try:
            next_sub = next(subs)
            if next_sub.start < start_time:
                continue
            next_sub.end = min(next_sub.end, next_sub.start + max_duration)
            subs_list.append(next_sub)
        # We don't catch SRTParseError here b/c that typically raised when we
        # are trying to parse with the wrong encoding, in which case we might
        # be able to try another one on the *entire* set of subtitles elsewhere.
        except ValueError as e:
            if tolerant:
                logger.warning(e)
                continue
            else:
                raise
        except StopIteration:
            break
    return subs_list
def get_text_from_url(url, timestamps):
    """Collect caption text of a YouTube video per timestamp interval.

    :param url: video URL handed to ``YouTube``
    :param timestamps: sequence of pairs; element ``[0]`` is compared with a
        caption's start and element ``[1]`` with its end, both in seconds
    :return: dict mapping each position in ``timestamps`` to the
        space-separated caption text collected for it
    """
    yt = YouTube(url)
    print(yt.caption_tracks)
    caption = None
    for c in yt.caption_tracks:
        # We do not want autogen caption
        if 'auto' in c.name:
            continue
        caption = c
    # No track without 'auto' in its name: fall back to the first track.
    if caption == None:
        caption = yt.caption_tracks[0]
    if 'auto' in caption.name:
        print("AUTO IN CAPTION NAME")
    caption_srt = caption.generate_srt_captions()
    subtitle_generator = srt.parse(caption_srt)
    timestamp_to_text = {}
    for i in range(len(timestamps)):
        timestamp_to_text[i] = ""
    # t_i is the interval currently being filled; it only moves forward.
    t_i = 0
    for sub in subtitle_generator:
        start = sub.start.total_seconds()
        end = sub.end.total_seconds()
        if start > timestamps[t_i][0]:
            timestamp_to_text[t_i] += sub.content + " "
        # A caption ending past the current interval advances to the next
        # interval and is appended there as well.
        if end > timestamps[t_i][1]:
            t_i += 1
            if t_i >= len(timestamps):
                break
            timestamp_to_text[t_i] += sub.content + " "
    return timestamp_to_text
def on_pick_sub_file(self, widget):
    """Let the user pick a subtitle file and show its entries in a tree view.

    Runs a file chooser dialog, stores the chosen path in ``self.sub_uri``,
    parses the file into ``self.sub_list`` as
    ``(index, start, end, content)`` tuples and, when any were found, builds
    a ``Gtk.TreeView`` inside ``self.scrollable_treelist``.

    :param widget: not used in the body
    """
    dialog = Gtk.FileChooserDialog(
        title="Please choose a file", parent=self, action=Gtk.FileChooserAction.OPEN
    )
    dialog.add_buttons(
        Gtk.STOCK_CANCEL,
        Gtk.ResponseType.CANCEL,
        Gtk.STOCK_OPEN,
        Gtk.ResponseType.OK,
    )
    # self.add_filters(dialog)
    response = dialog.run()
    if response == Gtk.ResponseType.OK:
        print("Open clicked")
        file_name = dialog.get_filename()
        print("File selected: " + file_name)
        self.sub_uri = file_name
    elif response == Gtk.ResponseType.CANCEL:
        print("Cancel clicked")
    dialog.destroy()
    # NOTE(review): self.sub_uri is only assigned in the OK branch above;
    # after a cancel, whatever value it held before is used.
    print(self.sub_uri)
    self.sub_list = []
    filepath = self.sub_uri.strip()
    if os.path.isfile(filepath):
        filepath = os.path.realpath(filepath)
        with open(filepath) as f:
            read_data = f.read()
            subs = parse(read_data)
            for sub in subs:
                self.sub_list.append((sub.index, str(sub.start), str(sub.end), sub.content))
        # print(self.sub_list)
        # self.player.set_property("suburi", self.sub_uri)
        # The code below should be moved into a separate method
        if len(self.sub_list) > 0:
            self.sub_liststore = Gtk.ListStore(int, str, str, str)
            for sub_ref in self.sub_list:
                self.sub_liststore.append(list(sub_ref))
            self.subview = Gtk.TreeView(model=self.sub_liststore)
            # One text column per tuple field, in tuple order.
            for i, column_title in enumerate(
                ["index", "start", "end", "content"]
            ):
                renderer = Gtk.CellRendererText()
                column = Gtk.TreeViewColumn(column_title, renderer, text=i)
                self.subview.append_column(column)
            select = self.subview.get_selection()
            select.connect("changed", self.on_tree_selection_changed)
            # setting up the layout, putting the treeview in a scrollwindow, and the buttons in a row
            self.scrollable_treelist.add(self.subview)
            print(self.subview)
            print(self.scrollable_treelist)
            self.subview.show_all()
def chunk_srt(text):
    """Group the subtitles of SRT text into chunks with text and HTML forms.

    :param text: SRT text
    :return: list of dicts, one per group from ``group_subs_list``, with
        ``'text'`` (cleaned contents joined by newlines) and ``'html'`` (one
        ``<p t0=... t1=...>`` element per subtitle, joined by newlines)
    """
    # filter ones with no useful chars, like '♬~'
    subs = [sub for sub in srt.parse(text)
            if remove_spaces_punctuation(sub.content)]

    chunks = []
    for group in group_subs_list(subs):
        text_pieces = []
        html_pieces = []
        for sub in group:
            t0 = sub.start.total_seconds()
            t1 = sub.end.total_seconds()
            # h2z only affects kana by default, which is what we want
            cleaned = jaconv.h2z(sub.content).strip()
            escaped = html.escape(cleaned).replace('\n', '<br>')
            text_pieces.append(cleaned)
            html_pieces.append(f'<p t0="{t0:.3f}" t1="{t1:.3f}">{escaped}</p>')
        chunks.append({
            'text': '\n'.join(text_pieces),
            'html': '\n'.join(html_pieces),
        })
    return chunks
def cleanSubs(self):
    """Strip parenthesised text from every selected subtitle file, in place.

    For each path in ``self.filenames[0]`` the text between fullwidth
    parentheses and between ASCII parentheses is removed line by line, the
    result is re-parsed and re-composed with ``srt`` and written back to the
    same file. A confirmation dialog is shown afterwards.
    """
    # The first pattern must use fullwidth parentheses: written with ASCII
    # parentheses, "(.*)" is a capture group that matches (and erases) the
    # whole of every line, including indexes and timestamps.
    fullwidth_parens = re.compile("（.*）")
    ascii_parens = re.compile("\\(.*\\)")

    for filename in self.filenames[0]:
        with open(filename, "r", encoding="utf8") as file:
            lines = file.readlines()

        cleaned_text = ''.join(
            ascii_parens.sub("", fullwidth_parens.sub("", line))
            for line in lines)

        subtitles = list(srt.parse(cleaned_text))
        clean = [subtitle for subtitle in subtitles
                 if subtitle.content is not None]
        final = srt.compose(clean)

        # One write call instead of one per character.
        with open(filename, "w", encoding="utf8") as file:
            file.write(final)

    dlg = QDialog(self)
    dlg.setWindowTitle("Done!")
    layout = QVBoxLayout()
    dlg.setLayout(layout)
    dlg.layout().addWidget(QLabel("Success!"))
    dlg.exec_()
def get_list_of_subtitle_content(subtitles):
    """Return the content of every subtitle with newlines turned into spaces.

    :param subtitles: SRT text
    :return: list of single-line contents, or an empty list when parsing
        raises ``srt.SRTParseError``
    """
    contents = []
    try:
        # Parsing is lazy, so errors surface while iterating.
        for chunk in srt.parse(subtitles):
            contents.append(chunk.content.replace('\n', ' '))
    except srt.SRTParseError:
        # number_corrupted_files += 1
        return []
    return contents
def loadSubtitlesFromFile(self):
    """Load the subtitle file named in the path input and refresh the word list.

    The parsed result is stored in ``self.subtitles`` before
    ``self.populate_words_list()`` is called.
    """
    file_path = self.ui.filePathInput.text()
    parser_parameters = sub_parser.ModuleParameters(file_path)
    raw_sub = sub_parser.read_subtitles_file(parser_parameters.subtitle)
    self.subtitles = sub_parser.parse_subtitles(srt.parse(raw_sub))
    self.populate_words_list()
def shiftSubsNegative(self):
    """Move every subtitle of the selected files earlier by ``self.timeShift`` ms.

    Each path in ``self.filenames[0]`` is parsed, both timestamps of every
    subtitle are reduced by the shift, and the file is overwritten with the
    re-composed result. A confirmation dialog is shown afterwards.
    """
    for filename in self.filenames[0]:
        with open(filename, "r", encoding="utf8") as source:
            raw_text = source.read()

        retimed = []
        for entry in list(srt.parse(raw_text)):
            shift = datetime.timedelta(milliseconds=self.timeShift)
            entry.start -= shift
            entry.end -= shift
            retimed.append(entry)

        final = srt.compose(retimed)
        with open(filename, "w", encoding="utf8") as target:
            target.write(final)

    dlg = QDialog(self)
    dlg.setWindowTitle("Retiming done!")
    layout = QVBoxLayout()
    dlg.setLayout(layout)
    dlg.layout().addWidget(QLabel("Retiming successful!"))
    dlg.exec_()
def __init__(
        self,
        id="",
        f=None,  # one or many (list) file
        comment='##',
        set_id_as_prog=True,
        debug=False):
    """Load one or several srt files, skipping comment lines.

    :param id: id of the srt, stored as ``self.id``
    :param f: a path or a list of paths; files are read in sorted order
    :param comment: lines starting with this prefix are dropped
    :param set_id_as_prog: when true the subtitles are sorted and re-indexed
    :param debug: when true the kept raw lines are stored as ``self.raw``
    :raises ValueError: when ``f`` is neither a str nor a list
    """
    if isinstance(f, str):
        f = [f]
    elif not isinstance(f, list):
        raise ValueError("f must be a str path or a list of path")

    # id of the srt
    self.id = id

    # raw content
    content = []
    for path in sorted(f):
        with open(path) as handle:
            content.extend(
                line for line in handle if not line.startswith(comment))
    if debug:
        self.raw = content

    # parsing
    subs = list(srt.parse("".join(content)))
    if set_id_as_prog:
        subs = list(srt.sort_and_reindex(subs))
    self.subs = subs
def test_compose_and_parse_strict_mode(content):
    """Strict composing strips blank lines; non-strict keeps content as is."""
    newline = '\n'
    content = newline + content + newline * 2 + content + newline
    sub = CONTENTLESS_SUB(content=content)

    strict_text = list(srt.parse(sub.to_srt()))[0].content
    unstrict_text = list(srt.parse(sub.to_srt(strict=False)))[0].content

    # Strict mode should remove blank lines in content, leading, and trailing
    # newlines.
    assert_false(strict_text.startswith('\n'))
    assert_false(strict_text.endswith('\n'))
    assert_false('\n\n' in strict_text)

    # When strict mode is false, no processing should be applied to the
    # content (other than \r\n becoming \n).
    eq(unstrict_text, sub.content.replace('\r\n', '\n'))
def importLyrics(self, lyricsPath):
    """Load lyrics and their start times from an SRT file.

    ``self.subtitleDict`` is reset and then filled with the content
    (``"lyrics"``) and start time (``"timestamps"``) of every subtitle. A
    read failure is logged as a warning and leaves the dict empty.

    :param lyricsPath: path of the SRT file
    """
    self.subtitleDict = {"lyrics": [], "timestamps": []}

    # open the subtitles file and remove all non-ascii characters
    try:
        # The context manager closes the handle, which the original left open.
        with open(lyricsPath, "r") as lyricsFile:
            srtFile = lyricsFile.read().encode('ascii', 'ignore').decode()
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
        logger.logData(source="Lyrics", priority="WARN", msgType="Import failed", msgData=(lyricsPath))
        return

    subs = srt.parse(srtFile)
    self.lyricsImported = 1

    # Stays None for a file without subtitles; the original read the loop
    # variable after the loop and failed when the loop never ran.
    lastTime = None
    for subEntry in subs:
        self.subtitleDict["lyrics"].append(subEntry.content)
        self.subtitleDict["timestamps"].append(subEntry.start)
        # startTime = subEntry.start
        # startTimestamp = startTime.seconds + startTime.microseconds/1000000.
        # self.subtitleDict["timestamps"].append(startTimestamp)
        # timestamp of the last entry, used as the duration of the subtitle file
        lastTime = subEntry.start

    numEntries = len(self.subtitleDict['lyrics'])
    logger.logData(source="Lyrics", priority="INFO", msgType="Import", msgData=(lyricsPath, numEntries, lastTime))
def test_parsing_content_with_blank_lines(subs):
    """Content containing a blank line survives a non-strict round trip."""
    for subtitle in subs:
        # A blank line in the middle triggers the "special" content parsing
        # for erroneous SRT files that have blank lines.
        subtitle.content = "\n\n".join([subtitle.content, subtitle.content])

    composed = srt.compose(subs, reindex=False, strict=False)
    subs_eq(srt.parse(composed), subs)
def test_parsing_no_content(subs):
    """Subtitles with empty content survive a non-strict compose/parse cycle."""
    for subtitle in subs:
        subtitle.content = ''

    compose_options = {'reindex': False, 'strict': False}
    reparsed_subtitles = srt.parse(srt.compose(subs, **compose_options))
    subs_eq(reparsed_subtitles, subs)
def test_can_compose_without_ending_blank_line(input_subs):
    '''
    Input whose final blank line is missing must still parse.

    Many sub editors don't add a blank line to the end, and many editors
    accept it. We should just accept this too in input.
    '''
    composed = srt.compose(input_subs, reindex=False)
    # Drop the last character of the composed text before re-parsing.
    subs_eq(srt.parse(composed[:-1]), input_subs)
def test_compose_and_parse_strict_crlf(input_subs):
    """Text composed with CRLF line endings parses back to the input."""
    composed = srt.compose(input_subs, reindex=False).replace('\n', '\r\n')
    reparsed_subs = list(srt.parse(composed))

    # Undo the line-ending change inside the content before comparing.
    for reparsed in reparsed_subs:
        reparsed.content = reparsed.content.replace('\r\n', '\n')

    subs_eq(reparsed_subs, input_subs)
def set_basic_args(args):
    """Resolve ``args.input`` / ``args.output`` into parsed subtitles and a writer.

    ``args.encoding`` defaults to ``sys.getdefaultencoding()``.
    ``args.input`` is replaced by the ``srt.parse`` generator of the input
    (or, for a mutable sequence of inputs, each element is replaced in
    place); ``args.output`` is replaced by a writer for the dash stream or by
    the named file opened for writing.

    :param args: argument namespace; ``input`` / ``output`` may be missing
    """
    # collections.MutableSequence was removed in Python 3.10; the ABC lives in
    # collections.abc. Fall back for interpreters that lack collections.abc.
    try:
        from collections.abc import MutableSequence
    except ImportError:
        from collections import MutableSequence

    if not args.encoding:
        args.encoding = sys.getdefaultencoding()

    # TODO: dedupe some of this
    for stream_name in ('input', 'output'):
        log.debug('Processing stream "%s"', stream_name)

        try:
            stream = getattr(args, stream_name)
        except AttributeError:
            # For example, in the case of no_output
            continue

        r_enc = codecs.getreader(args.encoding)
        w_enc = codecs.getwriter(args.encoding)

        log.debug('Got %r as stream', stream)
        if stream in DASH_STREAM_MAP.values():
            log.debug('%s in DASH_STREAM_MAP', stream_name)
            if stream is args.input:
                args.input = srt.parse(r_enc(args.input).read())
            elif stream is args.output:
                args.output = w_enc(args.output)
        else:
            log.debug('%s not in DASH_STREAM_MAP', stream_name)
            if stream is args.input:
                if isinstance(args.input, MutableSequence):
                    for i, input_fn in enumerate(args.input):
                        if input_fn in DASH_STREAM_MAP.values():
                            if stream is args.input:
                                args.input[i] = srt.parse(
                                    r_enc(input_fn).read()
                                )
                        else:
                            f = _open(input_fn, 'r', encoding=args.encoding)
                            with f:
                                args.input[i] = srt.parse(f.read())
                else:
                    f = _open(stream, 'r', encoding=args.encoding)
                    with f:
                        args.input = srt.parse(f.read())
            else:
                args.output = _open(args.output, 'w', encoding=args.encoding)
def test_parser_accepts_no_newline_no_content(subs):
    """Empty-content blocks separated by a single newline still parse."""
    stripped_blocks = []
    for sub in subs:
        # Limit size so we know how many lines to remove
        sub.content = ''
        # Remove the last \n so that there is only one
        stripped_blocks.append(sub.to_srt()[:-1])

    subs_eq(srt.parse(''.join(stripped_blocks)), subs)
def test_can_parse_index_trailing_ws(input_subs, whitespace):
    """Whitespace appended to the index line does not break parsing."""
    suffix = "".join(whitespace)

    def pad_index_line(block):
        first_line, *other_lines = block.split("\n")
        return "\n".join([first_line + suffix] + other_lines)

    out = "".join(pad_index_line(sub.to_srt()) for sub in input_subs)
    subs_eq(srt.parse(out), input_subs)
def test_can_parse_index_leading_zeroes(input_subs, zeroes):
    """Zeroes prepended to the index line do not break parsing."""
    prefix = "".join(zeroes)

    def zero_pad_index_line(block):
        first_line, *other_lines = block.split("\n")
        return "\n".join([prefix + first_line] + other_lines)

    out = "".join(zero_pad_index_line(sub.to_srt()) for sub in input_subs)
    subs_eq(srt.parse(out), input_subs)
def srt_to_sub(srt_file):
    """Read a UTF-8 SRT file and return its post-processed subtitles.

    :param srt_file: path of the SRT file
    :return: the parsed subtitles after ``fix_srt_overlap`` and
        ``replace_numbers`` have been applied, in that order
    """
    with codecs.open(srt_file, 'r', 'utf-8') as myfile:
        subtitles = list(srt.parse(myfile.read()))

    # account for youtube's 'scrolling' subtitles
    without_overlap = fix_srt_overlap(subtitles)
    # replace digits with numbers
    return replace_numbers(without_overlap)
def test_parser_can_parse_with_fullwidth_delimiter(subs):
    """Timestamps using fullwidth delimiters parse like ASCII ones.

    In every block the first ASCII comma and the first ASCII colon of the
    timestamp line are swapped for their fullwidth forms before re-parsing.
    """
    original_srt_blocks = [sub.to_srt() for sub in subs]
    dot_srt_blocks = []

    for srt_block in original_srt_blocks:
        srt_lines = srt_block.split("\n")
        # The replacement characters must be the fullwidth comma (U+FF0C) and
        # colon (U+FF1A); with ASCII replacements both calls were no-ops and
        # the test exercised nothing.
        dot_timestamp = srt_lines[1].replace(",", "\uff0c", 1).replace(":", "\uff1a", 1)
        srt_lines[1] = dot_timestamp
        dot_srt_blocks.append("\n".join(srt_lines))

    composed_with_fullwidth = "".join(dot_srt_blocks)
    reparsed_subs = srt.parse(composed_with_fullwidth)
    subs_eq(reparsed_subs, subs)
def test_parser_noncontiguous(subs, fake_idx, garbage, fake_timedelta):
    """Garbage between subtitles makes the parser raise SRTParseError."""
    composed = srt.compose(subs)

    # Put some garbage between subs that should trigger our failed parsing
    # detection. Since we do some magic to try and detect blank lines that
    # don't really delimit subtitles, it has to look at least a little like an
    # SRT block.
    srt_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    separator_with_garbage = "\n\n%d\n%s %s" % (fake_idx, srt_timestamp, garbage)
    composed = composed.replace("\n\n", separator_with_garbage)

    with assert_raises(srt.SRTParseError):
        list(srt.parse(composed))
def __init__(self, options):
    """Set up a YouTube-backed source video, downloading it when missing.

    :param options: dict read for ``"youtube_id"`` and ``"dir"`` and passed
        on to ``SourceVideo.__init__``

    When ``<dir>/<youtube_id>.mp4`` does not exist, ``youtube-dl`` is run in
    ``dir`` and waited for. ``self.timecodes`` is the result of ``srt.parse``
    when ``<dir>/<youtube_id>.en.srt`` exists, otherwise ``None``.
    """
    SourceVideo.__init__(self, options)
    # subtitles = "--write-sub" if options["subtitles"] else ""
    self.youtube_id = options["youtube_id"]
    self.path = '%s/%s.mp4' % (options["dir"], self.youtube_id)

    if not os.path.exists(self.path):
        url = "http://www.youtube.com/watch?v=%s" % (self.youtube_id)
        cmd = 'youtube-dl --no-overwrites --write-info-json -f 18 -o %%(id)s.%%(ext)s %s' % (url)
        dl_process = Popen(cmd.split(' '), stdout=PIPE, cwd=options["dir"])
        # Block until the download process has finished.
        dl_process.communicate()

    self.srt = '%s/%s.en.srt' % (options["dir"], self.youtube_id)
    # NOTE(review): srt.parse is handed the file *path*, not the file's
    # contents -- confirm this module's parse() accepts a path.
    self.timecodes = srt.parse(self.srt) if os.path.exists(self.srt) else None
def test_parser_can_parse_with_dot_msec_delimiter(subs):
    """Timestamps using '.' before the milliseconds parse like ',' ones."""

    def with_dot_delimiter(srt_block):
        srt_lines = srt_block.split('\n')
        # We should only do the first two, as it might also be in the
        # proprietary metadata, causing this test to fail.
        srt_lines[1] = srt_lines[1].replace(',', '.', 2)
        return '\n'.join(srt_lines)

    original_srt_blocks = [sub.to_srt() for sub in subs]
    composed_with_dots = ''.join(map(with_dot_delimiter, original_srt_blocks))
    subs_eq(srt.parse(composed_with_dots), subs)
def test_parser_didnt_match_to_end_raises(subs, fake_idx, garbage, fake_timedelta):
    """Trailing unparseable content must be described exactly by the error.

    A block-like piece of garbage is appended after the valid subtitles; the
    raised SRTParseError has to report the unmatched text and the expected
    and actual start offsets.
    """
    rendered_blocks = [sub.to_srt() for sub in subs]
    bogus_timestamp = srt.timedelta_to_srt_timestamp(fake_timedelta)
    trailing_garbage = "\n\n%d\n%s %s" % (fake_idx, bogus_timestamp, garbage)
    composed = "".join(rendered_blocks + [trailing_garbage])

    with assert_raises(srt.SRTParseError) as thrown_exc:
        list(srt.parse(composed))

    # The parser consumes as many "\n" as needed until its lookahead
    # assertion is met, so the newlines leading the garbage are not part of
    # the reported unmatched content.
    garbage_stripped = trailing_garbage.lstrip("\n")
    total_length = len(composed)

    eq(garbage_stripped, thrown_exc.exception.unmatched_content)
    eq(total_length - len(garbage_stripped), thrown_exc.exception.expected_start)
    eq(total_length, thrown_exc.exception.actual_start)
2 00:00:14,815 --> 00:00:16,498 Hello, Jia Ruipeng 3 00:00:16,934 --> 00:00:17,814 Hello, UCAS ''' # open('./tmp.srt').read() ################################################################## ## 手动处理 sections = srt_text.split('\n\n') # 每三行一个空行, 进行分割 print(len(sections)) # 3 results = [[item for item in section.split('\n')] for section in sections] print(results[0][1].split(' ')[0].split(',')[0]) # 00:00:12 ################################################################## ## srt 库 import srt, pprint gen = list(srt.parse(srt_text)); print(type(gen)) pprint.pprint(gen) # [Subtitle(index=1, start=datetime.timedelta(0, 12, 815000), end=datetime.timedelta(0, 14, 509000), content='Hello, world.', proprietary=''), # Subtitle(index=2, start=datetime.timedelta(0, 14, 815000), end=datetime.timedelta(0, 16, 498000), content='Hello, Jia Ruipeng', proprietary=''), # Subtitle(index=3, start=datetime.timedelta(0, 16, 934000), end=datetime.timedelta(0, 17, 814000), content='Hello, UCAS', proprietary='')] print(type(gen[0])) # <class 'srt.Subtitle'> sub_1 = gen[0] print(sub_1.start) # 0:00:12.815000 print(sub_1.end) # 0:00:14.509000 print(sub_1.index) # 1 print(sub_1.content) # Hello, world. print(sub_1.to_srt)
def main():
    """Play the configured video in omxplayer and send subtitles to an Arduino.

    Reads settings from ``config.txt`` next to this script, resets the
    ``.count_plays`` counter file, opens a serial connection, registers a
    button callback on GPIO 21, launches omxplayer in loop mode, connects to
    its D-Bus interface, loads every ``*.srt`` file from the subtitle
    directory, and then polls the playback position, writing each subtitle's
    content to the serial port as playback reaches it.
    """
    # Absolute directory of this script (with trailing slash); every other
    # path below is built relative to it.
    abs_path = os.path.dirname(os.path.abspath(__file__)) + "/"

    # Read the configuration file.
    # NOTE(review): the file object passed to readfp() is never closed.
    config = ConfigParser.ConfigParser()
    config.readfp(open(abs_path + 'config.txt', 'r'))
    default_lang = config.get('multicaptions config', 'default_lang')
    launch_state = config.get('multicaptions config', 'launch_state')
    video = abs_path + config.get('multicaptions config', 'video_filename')
    subtitle_directory = abs_path + config.get('multicaptions config', 'subtitle_directory')

    # State used for tracking how long subtitles stay enabled.
    # subtitle_duration is compared against the strings "1", "2" and "3" below.
    subtitle_duration = config.get('multicaptions config', 'subtitle_duration')
    played_through_once = False
    # NOTE(review): never reassigned inside this function, yet it is passed
    # to long() in the playback loop; long("") raises ValueError. Presumably
    # it is updated elsewhere (e.g. by the button callback) - confirm.
    sub_start_position = ""

    # Create/overwrite .count_plays (number of playthroughs), starting at 0.
    f = open(abs_path + '.count_plays', 'w')
    f.write("0")
    f.close()

    # Set up the serial connection to the Arduino: use the last listed port
    # whose description contains "ACM".
    # NOTE(review): if no port matches, arduino_port is never bound and the
    # Serial() call below raises NameError.
    ports = list(serial.tools.list_ports.comports())
    for p in ports:
        if "ACM" in str(p):
            # Keep the first 12 characters of the port description
            # (presumably a "/dev/ttyACM0"-style device path - confirm).
            arduino_port = str(p)[:12]
    # NOTE(review): `.9600` is the float 0.96, probably meant to be 9600;
    # the port is also opened through the name `s` while the port listing
    # uses `serial` - confirm both against the file's imports.
    ser=s.Serial(arduino_port, .9600)
    time.sleep(3)

    # Choose the language: first command-line argument if given, otherwise
    # the default from config.txt.
    if(len(sys.argv) > 1):
        language = sys.argv[1]
    else:
        language = default_lang

    # Send the language to the Arduino.
    ser.write("{LANGUAGE" + langdict[language] + language + "}")

    # Send the default display state to the Arduino.
    ser.write("{DEFAULT" + launch_state + "}")

    # Subtitles are written from the start only when the launch state is
    # "subtitles".
    if(launch_state == "subtitles"):
        write_subtitles = True
    else:
        write_subtitles = False

    # Set up the button: falling edge on GPIO 21 calls next_language.
    GPIO.setmode(GPIO.BCM)
    GPIO.setup(21, GPIO.IN, pull_up_down=GPIO.PUD_UP) #GPIO21 = pin 40
    GPIO.add_event_detect(21, GPIO.FALLING, callback=next_language, bouncetime=200)

    # Start omxplayer (not waited on; it keeps running alongside this loop).
    cmd = "omxplayer -o local --no-osd --loop %s" %(video)
    Popen([cmd], shell=True)

    # Connect to omxplayer over D-Bus. Any failure is retried; after 5000
    # failed attempts "ERROR" is printed and the program exits.
    done, retry = 0, 0
    while done==0:
        try:
            with open('/tmp/omxplayerdbus.' + getpass.getuser(), 'r+') as f:
                omxplayerdbus = f.read().strip()
            bus = dbus.bus.BusConnection(omxplayerdbus)
            object = bus.get_object('org.mpris.MediaPlayer2.omxplayer','/org/mpris/MediaPlayer2', introspect=False)
            dbusIfaceProp = dbus.Interface(object,'org.freedesktop.DBus.Properties')
            dbusIfaceKey = dbus.Interface(object,'org.mpris.MediaPlayer2.Player')
            # Disable in-player subtitles on the video.
            dbusIfaceKey.Action(dbus.Int32("30"))
            done=1
        except:
            retry+=1
            if retry >= 5000:
                print "ERROR"
                raise SystemExit

    # Import the SRT subtitle files into one "subtitles" dict keyed by
    # language, where the language is the second dot-separated part of the
    # file name.
    subtitles = collections.OrderedDict()
    os.chdir(subtitle_directory)

    # First add the default language so it is inserted first.
    for subs in glob.glob("*.srt"):
        lang = subs.split('.')[1]
        if(lang == default_lang):
            with io.open(subs, "r", encoding="utf-8-sig") as myfile:
                subfile = myfile.read()
                subtitle_generator = srt.parse(subfile)
                subtitles[lang] = list(subtitle_generator)

    # Then add the other languages. The default language file is parsed and
    # assigned again here; re-assigning an existing OrderedDict key keeps
    # its original position.
    for subs in glob.glob("*.srt"):
        lang = subs.split('.')[1]
        with io.open(subs, "r", encoding="utf-8-sig") as myfile:
            subfile = myfile.read()
            subtitle_generator = srt.parse(subfile)
            subtitles[lang] = list(subtitle_generator)

    # Iterate through the subtitles and write them out.
    # i is the index of the subtitle being tracked; next_i is advanced when
    # that subtitle's time window has been handled.
    i = 0
    next_i = 0
    position = "0"
    duration = "0"

    # Poll until the player reports a duration; errors are ignored and the
    # call is simply retried.
    while duration == "0":
        try:
            duration = chop_digits(str(dbusIfaceProp.Duration()))
        except:
            pass

    while long(duration) > long(position):
        start = tc_to_ms(str(subtitles[language][i].start))
        end = tc_to_ms(str(subtitles[language][i].end))
        position = chop_digits(str(dbusIfaceProp.Position()))

        # subtitle_duration "2": once a playthrough has completed with
        # subtitles on, switch back to the default display when playback
        # passes sub_start_position.
        if subtitle_duration == "2" and write_subtitles == True and played_through_once == True:
            if long(position) > long(sub_start_position):
                write_subtitles = False
                ser.write("{DEFAULT" + launch_state + "}")

        if long(position) > long(start) and long(position) <= long(end):
            # Playback is inside the current subtitle's time window.
            if i > next_i:
                next_i += 1
            elif i == next_i:
                # Only write when write_subtitles is True, to account for
                # launch_state.
                if write_subtitles == True:
                    ser.write(subtitles[language][i].content.encode("utf-8") + "\r")
                # NOTE(review): nesting reconstructed from unindented source;
                # confirm whether this increment belongs inside the `if` above.
                next_i += 1
        elif long(position) > long(end):
            # Playback has moved past the current subtitle.
            if subtitles[language][i] == subtitles[language][-1]:
                # The last subtitle has finished: restart from the first one.
                i = 0
                next_i = 0
                if launch_state != "subtitles" and write_subtitles == True:
                    if(subtitle_duration == "1" or (subtitle_duration == "3" and played_through_once == True)):
                        # Return the display to its default state.
                        write_subtitles = False
                        played_through_once = False
                        language = default_lang
                        ser.write("{DEFAULT" + launch_state + "}")
                    elif(subtitle_duration == "3" and played_through_once == False):
                        played_through_once = True
                    elif(subtitle_duration == "2"):
                        played_through_once = True
                # Update .count_plays (number of playthroughs): read the
                # stored count and write it back incremented by one.
                with open(abs_path + '.count_plays', 'r') as count_video:
                    value = int(count_video.read())
                with open(abs_path + '.count_plays', 'w') as count_video:
                    count_video.write(str(value + 1))
                # Write to /etc/motd.
                update_dashboard()
            else:
                i += 1
        elif long(position) == 0:
            # Position reads as zero: start again from the first subtitle.
            i = 0
            next_i = 0
def test_parsing_spaced_arrow(subs):
    """Text whose "-->" arrows are written as "- >" must still parse."""
    composed = srt.compose(subs, reindex=False, strict=False)
    # Every occurrence of "-->" in the composed text is rewritten.
    spaced_block = composed.replace("-->", "- >")
    subs_eq(srt.parse(spaced_block), subs)
def test_compose_and_parse_strict_custom_eol(input_subs, eol):
    """Composing with a custom ``eol`` and parsing the result round-trips."""
    serialized = srt.compose(input_subs, reindex=False, eol=eol)
    round_tripped = srt.parse(serialized)
    subs_eq(round_tripped, input_subs)
def test_compose_and_parse_strict(input_subs):
    """Composing without reindexing and parsing the result round-trips."""
    serialized = srt.compose(input_subs, reindex=False)
    subs_eq(srt.parse(serialized), input_subs)
def test_can_compose_without_eol_at_all(input_subs):
    """Composed text with its trailing line endings removed must still parse."""
    serialized = srt.compose(input_subs, reindex=False)
    # Strip every trailing "\r" and "\n" so the text ends right after the
    # last subtitle's content.
    without_trailing_eol = serialized.rstrip('\r\n')
    subs_eq(srt.parse(without_trailing_eol), input_subs)
def test_compose_and_parse_from_file(input_subs):
    """Parsing from a file-like object gives back the composed subtitles."""
    serialized = srt.compose(input_subs, reindex=False)
    # An in-memory StringIO stands in for a file here.
    file_like = StringIO(serialized)
    subs_eq(srt.parse(file_like), input_subs)