def convert_sup_to_srt(filename, file_info):
    """Write one WebVTT file per language column of a subtitle sheet.

    The sheet is read with ``get_gformat_subs``; each row carries the start
    time in column 0, the end time in column 1 and one text column per
    language, starting at column 3. For every language a file named
    ``live/subtitles/<basename><tag>.vtt`` is produced.

    Args:
        filename: Path of the source subtitle sheet.
        file_info: List that receives one
            ``[shortname, cue_count, new_filename, language]`` entry per
            language written.

    Side effects:
        Overwrites ``temp.vtt`` in the working directory and updates the
        module-level ``prog_dict`` with the largest cue count seen for the
        file's base name.

    Raises:
        IOError/OSError: If the output file cannot be written (the previous
            shell-based implementation failed silently here).
    """
    column = 2
    try:
        (headers, group) = get_gformat_subs(filename)
        for language in headers:
            subs = pysrt.SubRipFile()
            column = column + 1
            tag = language.replace(" ", "_").decode('ascii', 'ignore')
            for line in group:
                # Cells with at most one character are treated as empty.
                if len(line[column]) > 1:
                    current_sub = pysrt.SubRipItem()
                    # WebVTT uses '.' as the millisecond separator.
                    current_sub.start = line[0].replace(',', '.')
                    current_sub.end = line[1].replace(',', '.')
                    current_sub.text = line[column].decode('utf-8')
                    subs.append(current_sub)
            subs.save('temp.vtt')

            shortname = os.path.splitext(os.path.basename(filename))[0]
            new_filename = 'live/subtitles/' + shortname + tag + '.vtt'
            # Prepend the WEBVTT header in Python instead of going through
            # the shell, so unusual file names cannot break or inject
            # commands.
            with open('temp.vtt', 'rb') as cue_file:
                cues = cue_file.read()
            with open(new_filename, 'wb') as vtt_file:
                vtt_file.write(b'WEBVTT\n')
                vtt_file.write(cues)

            # Remember the largest cue count seen for this programme.
            if shortname not in prog_dict or prog_dict[shortname] < len(subs):
                prog_dict[shortname] = len(subs)
            file_info.append([shortname, len(subs), new_filename, language])
            print(new_filename)
    except AttributeError:
        # We would expect this to be because we've been handed a file that's
        # outside our type.
        # TODO: we should identify exactly where this error appears for
        # various types of tests
        pass
def import_from_srt(subtitulation, vars):
    """Import the subtitles of an uploaded SRT file into the database.

    Args:
        subtitulation: Record whose ``id`` owns the imported subtitles.
        vars: Request variables; ``vars.source.file`` is the uploaded SRT
            stream and ``vars.overwrite`` requests deletion of the existing
            subtitles first.

    Returns:
        dict with ``removed`` (rows deleted), ``inserted`` (rows added) and
        ``errors`` (messages for subtitles whose times could not be
        converted).
    """
    import pysrt
    import StringIO

    # Create the srt object
    mysrt = pysrt.SubRipFile(encoding=ENCODING)
    srtinput = [unicode(line, ENCODING)
                for line in vars.source.file.read().splitlines()]
    # mysrt.read(vars.source.file)
    mysrt.read(srtinput)

    result = dict(removed=0, inserted=0, errors=[])
    if vars.overwrite:
        # Delete any existent subtitle
        result["removed"] = db(
            db.subtitle.subtitulation_id == subtitulation.id).count()
        db(db.subtitle.subtitulation_id == subtitulation.id).delete()

    for subtitle in mysrt:
        body = subtitle.text
        try:
            starts = subtitle.start.to_time()
            ends = subtitle.end.to_time()
        except ValueError:
            result["errors"].append(T("Invalid time input: %(start)s - %(end)s", lazy=False) % \
                dict(start=subtitle.start, end=subtitle.end))
            # Skip the row: inserting here would reuse the previous
            # subtitle's times (or fail on undefined names for the first).
            continue
        db.subtitle.insert(subtitulation_id=subtitulation.id,
                           body=body, starts=starts, ends=ends)
        result["inserted"] += 1
    return result
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``((start, end), text)`` entries as SRT-formatted text.

    Args:
        subtitles: Iterable of ``((start, end), text)`` pairs; times are in
            seconds.
        show_before: Seconds subtracted from each start (clamped at 0).
        show_after: Seconds added to each end.

    Returns:
        The numbered cues (starting at 1) joined with newlines.
    """
    def _make_item(number, span, caption):
        # Build one cue with the padded time range.
        begin, finish = span
        entry = pysrt.SubRipItem()
        entry.index = number
        entry.text = six.text_type(caption)
        entry.start.seconds = max(0, begin - show_before)
        entry.end.seconds = finish + show_after
        return entry

    srt = pysrt.SubRipFile()
    for number, (span, caption) in enumerate(subtitles, start=1):
        srt.append(_make_item(number, span, caption))
    return '\n'.join(map(six.text_type, srt))
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``(range, text)`` entries as SRT-formatted unicode text.

    Args:
        subtitles: Iterable of ``((start, end), text)`` pairs; times are in
            seconds.
        show_before: Seconds subtracted from each start (clamped at 0).
        show_after: Seconds added to each end.

    Returns:
        The cues joined with newlines; cue indexes are left at the pysrt
        default because none is assigned here.
    """
    srt = pysrt.SubRipFile()
    for span, caption in subtitles:
        entry = pysrt.SubRipItem()
        entry.text = force_unicode(caption)
        begin, finish = span
        entry.start.seconds = max(0, begin - show_before)
        entry.end.seconds = finish + show_after
        srt.append(entry)
    return '\n'.join(unicode(entry) for entry in srt)
def CreateCleanSubAndMuteList(self, cleanSubsFileSpec=None):
    """Write a censored subtitle file and build the matching mute filters.

    The input subtitles are copied to ``<name>_utf8<ext>`` and converted
    with ``UTF8Convert``. The swears file (one ``word`` or
    ``word|replacement`` per line) is loaded into ``self.swearsMap``. Every
    subtitle containing a listed word is rewritten and saved to the clean
    subtitle file, and one ffmpeg ``volume`` filter string per censored
    subtitle is stored in ``self.muteTimeList``.

    Args:
        cleanSubsFileSpec: Output path for the censored subtitles; defaults
            to ``<input name>_clean<ext>``.
    """
    subFileParts = os.path.splitext(self.inputSubsFileSpec)
    self.tmpSubsFileSpec = subFileParts[0] + "_utf8" + subFileParts[1]
    shutil.copy2(self.inputSubsFileSpec, self.tmpSubsFileSpec)
    UTF8Convert(self.tmpSubsFileSpec)

    if cleanSubsFileSpec is not None:
        self.cleanSubsFileSpec = cleanSubsFileSpec
    else:
        self.cleanSubsFileSpec = subFileParts[0] + "_clean" + subFileParts[1]

    with open(self.swearsFileSpec) as f:
        lines = [line.rstrip('\n') for line in f]
    for line in lines:
        lineMap = line.split("|")
        if len(lineMap) > 1:
            self.swearsMap[lineMap[0]] = lineMap[1]
        else:
            self.swearsMap[lineMap[0]] = "*****"

    # The pattern matches case-insensitively, so the replacement lookup must
    # be case-insensitive too; looking up the matched text verbatim raised
    # KeyError for e.g. "Damn" when the list contains "damn".
    swearsLookup = dict(
        (word.lower(), mask) for word, mask in self.swearsMap.items())
    # A blank entry would add an empty alternative that matches at every
    # word boundary; escaping keeps regex metacharacters literal.
    swearWords = [re.escape(word) for word in self.swearsMap.keys() if word]
    replacer = None
    if swearWords:
        replacer = re.compile(
            r'\b(' + '|'.join(swearWords) + r')\b', re.IGNORECASE)

    def scrub(text):
        # Return the text with every listed word replaced by its mask.
        if replacer is None:
            return text
        return replacer.sub(
            lambda m: swearsLookup.get(m.group().lower(), "*****"), text)

    subs = pysrt.open(self.tmpSubsFileSpec)
    newSubs = pysrt.SubRipFile()
    for sub in subs:
        newText = scrub(sub.text)
        if newText != sub.text:
            # Only subtitles that actually changed go into the clean file.
            newSub = sub
            newSub.text = newText
            newSubs.append(newSub)
    newSubs.save(self.cleanSubsFileSpec)

    newLines = []
    for sub in newSubs:
        newLines.append([sub.start.to_time(), sub.end.to_time()])

    self.muteTimeList = []
    for timePair in newLines:
        # Convert each datetime.time into seconds for the ffmpeg filter.
        lineStart = (timePair[0].hour * 60.0 * 60.0) + (
            timePair[0].minute * 60.0) + timePair[0].second + (
                timePair[0].microsecond / 1000000.0)
        lineEnd = (timePair[1].hour * 60.0 * 60.0) + (
            timePair[1].minute * 60.0) + timePair[1].second + (
                timePair[1].microsecond / 1000000.0)
        self.muteTimeList.append("volume=enable='between(t," +
                                 format(lineStart, '.3f') + "," +
                                 format(lineEnd, '.3f') + ")':volume=0")
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``(range, text)`` entries as SRT-formatted text.

    Args:
        subtitles: Iterable of ``((start, end, num), text)`` pairs; the
            third range element is unpacked but not used.
        show_before: Seconds subtracted from each start (clamped at 0).
        show_after: Seconds added to each end.

    Returns:
        The numbered cues (starting at 1) joined with newlines.
    """
    srt = pysrt.SubRipFile()
    number = 0
    for span, caption in subtitles:
        number += 1
        entry = pysrt.SubRipItem()
        entry.index = number
        entry.text = force_unicode(caption)
        begin, finish, _unused = span
        entry.start.seconds = max(0, begin - show_before)
        entry.end.seconds = finish + show_after
        srt.append(entry)
    return '\n'.join(map(six.text_type, srt))
def criarArquivoSRT(tempodalegenda, preenchimento_antes=0, preenchimento_depois=0):
    """Build SRT-formatted text from timed captions.

    Args:
        tempodalegenda: Iterable of ``((start, end), text)`` pairs; times
            are in seconds.
        preenchimento_antes: Seconds subtracted from each start (clamped
            at 0).
        preenchimento_depois: Seconds added to each end.

    Returns:
        The numbered cues (starting at 1) joined with newlines.
    """
    srt = pysrt.SubRipFile()
    number = 0
    for span, caption in tempodalegenda:
        begin, finish = span
        number += 1
        entry = pysrt.SubRipItem()
        entry.index = number
        entry.text = six.text_type(caption)
        entry.start.seconds = max(0, begin - preenchimento_antes)
        entry.end.seconds = finish + preenchimento_depois
        srt.append(entry)
    return '\n'.join(map(six.text_type, srt))
def srt_formatter(subtitles, show_before=0, show_after=0):
    """Render ``(range, text)`` entries as SRT-formatted text.

    Args:
        subtitles: Iterable of ``(range, text)`` pairs; ``range[0]`` and
            ``range[1]`` are the start and end in seconds.
        show_before: Seconds subtracted from each start (clamped at 0).
        show_after: Seconds added to each end.

    Returns:
        The numbered cues (starting at 1) joined with newlines.
    """
    srt = pysrt.SubRipFile()
    for number, pair in enumerate(subtitles, 1):
        span, caption = pair
        entry = pysrt.SubRipItem()
        entry.index = number
        entry.text = force_unicode(caption)
        # Index rather than unpack: the range may carry extra elements.
        begin = span[0]
        finish = span[1]
        entry.start.seconds = max(0, begin - show_before)
        entry.end.seconds = finish + show_after
        srt.append(entry)
    return '\n'.join(str(entry) for entry in srt)
def auto_generate(self, widget, name):
    """Start automatic subtitle generation for the loaded media file.

    Resets the ``./splitAudio`` scratch directory, loads the audio, creates
    an empty ``.srt`` file next to the media file, then runs
    ``self.start_generate`` in a separate process and ``self.show_generated``
    in a new thread.

    Args:
        widget: Unused here (signal-handler signature).
        name: Unused here (signal-handler signature).
    """
    # ignore_errors covers the first run, when the scratch directory does
    # not exist yet and rmtree would otherwise raise.
    shutil.rmtree('./splitAudio', ignore_errors=True)
    os.mkdir('./splitAudio')

    # NOTE(review): the [8:] slice presumably strips a URI prefix such as
    # "file:///" and [8:-4] additionally drops a 3-character extension -
    # confirm against where self.filename is set.
    self.sound_file = AudioSegment.from_file(self.filename[8:])
    self.len_file = len(self.sound_file)
    print("Length of track: " ,self.len_file/second, "seconds")

    # Create the (empty) subtitle file up front.
    self.sub_write_file = pysrt.SubRipFile(encoding='utf-8')
    self.sub_write_file.save(self.filename[8:-4] + ".srt", encoding='utf-8')

    self.gen = multiprocessing.Process(target = self.start_generate, args=())
    self.gen.start()
    self.auto_generate_subtitles = thread.start_new_thread(self.show_generated, ())
    return
def xml_to_srt(xml_data):
    """
    xml_data - ET

    Build a SubRipFile from caption XML: each child element supplies the
    text (HTML-unescaped) plus "start" and "dur" attributes in seconds.
    Returns the SubRipFile instance.
    """
    srt = pysrt.SubRipFile()
    for node in xml_data:
        entry = pysrt.SubRipItem()
        entry.text = h.unescape(node.text)
        begin = float(node.attrib["start"])
        entry.start.seconds = begin
        # The end is the start offset plus the cue duration.
        entry.end.seconds = begin + float(node.attrib["dur"])
        srt.append(entry)
    return srt
def srt_formatter(subtitles, padding_before=0, padding_after=0):
    """
    Serialize ((start, end), text) subtitles to SRT text, widening each cue
    by the optional padding (the padded start never goes below 0).
    """
    def _to_item(number, span, caption):
        # One numbered cue with the padded time range.
        begin, finish = span
        entry = pysrt.SubRipItem()
        entry.index = number
        entry.text = six.text_type(caption)
        entry.start.seconds = max(0, begin - padding_before)
        entry.end.seconds = finish + padding_after
        return entry

    srt = pysrt.SubRipFile()
    for number, (span, caption) in enumerate(subtitles, start=1):
        srt.append(_to_item(number, span, caption))
    return '\n'.join(map(six.text_type, srt))
def join_srt_files(srt_top, srt_btm, srt_out):
    """Merge two subtitle files into one and save it.

    Cues from ``srt_top`` are wrapped in ``TOP_SRT_TEMPLATE`` and added to
    the cues of ``srt_btm``; the result is sorted, re-indexed and written
    to ``srt_out``.
    """
    upper = pysrt.open(srt_top)
    lower = pysrt.open(srt_btm)

    combined = pysrt.SubRipFile(items=lower)
    for cue in upper:
        wrapped = TOP_SRT_TEMPLATE.format(cue.text)
        cue.text = wrapped
        combined.append(cue)

    combined.sort()
    combined.clean_indexes()
    combined.save(srt_out)
def generate(self, subtitles, show_before=0, show_after=0, *args, **kwargs) -> str:
    """Render ``((start, end), text)`` entries as SRT-formatted text.

    Args:
        subtitles: Iterable of ``((start, end), text)`` pairs; times are in
            seconds.
        show_before: Seconds subtracted from each start (clamped at 0).
        show_after: Seconds added to each end.
        *args, **kwargs: Accepted and ignored.

    Returns:
        The numbered cues (starting at 1) joined with newlines.
    """
    srt = pysrt.SubRipFile()
    number = 0
    for span, caption in subtitles:
        begin, finish = span
        number += 1
        entry = pysrt.SubRipItem()
        entry.index = number
        entry.text = str(caption)
        entry.start.seconds = max(0, begin - show_before)
        entry.end.seconds = finish + show_after
        srt.append(entry)
    return '\n'.join(map(str, srt))
def make_subtitles(frames_time, frames_annotation, user_id):
    """Save per-frame annotations as ``<MAIN_DIRECTORY><user_id>/subtitles.srt``.

    Args:
        frames_time: Sequence of ``(frame_number, seconds)`` entries.
        frames_annotation: Sequence whose items carry the caption text at
            index 1, aligned with ``frames_time``.
        user_id: Integer used to build the output directory name.

    Each cue runs from its frame's time to the next frame's time. The last
    cue gets a start time only; its end is left at the pysrt default.
    """
    srt = pysrt.SubRipFile(encoding='utf-8')
    last = len(frames_time) - 1

    for pos in range(last):
        cue = pysrt.SubRipItem()
        cue.index = frames_time[pos][0] + 1
        cue.start.seconds = frames_time[pos][1]
        cue.end.seconds = frames_time[pos + 1][1]
        cue.text = frames_annotation[pos][1]
        srt.append(cue)

    # Final frame: there is no following frame to take an end time from.
    tail = pysrt.SubRipItem()
    tail.index = frames_time[last][0] + 1
    tail.start.seconds = frames_time[last][1]
    tail.text = frames_annotation[last][1]
    srt.append(tail)

    srt.save(MAIN_DIRECTORY + '%d/subtitles.srt' % user_id)
def write_transcripts(transcript_filename, transcript, reg):
    """Write transcript lines and their time regions as a UTF-8 SRT file.

    Args:
        transcript_filename: File name appended to the module-level
            ``output_filepath``.
        transcript: Sequence of caption texts.
        reg: Sequence of ``(start, end)`` times in seconds, aligned with
            ``transcript``.

    Returns:
        True once the file has been written. Cue indexes start at 0.
    """
    print(transcript)
    import six

    srt = pysrt.SubRipFile()
    total = len(transcript)
    for number, span, caption in zip(range(total), reg, transcript):
        begin, finish = span
        print(number, begin, finish, caption)
        cue = pysrt.SubRipItem()
        cue.index = number
        cue.text = six.text_type(caption)
        cue.start.seconds = max(0, begin)
        cue.end.seconds = finish
        srt.append(cue)

    rendered = '\n'.join(map(six.text_type, srt))
    target = output_filepath + transcript_filename
    with open(target, "wb") as handle:
        handle.write(rendered.encode("utf-8"))
    print("+ Successfully Generated Subtitles.")
    return True
def combine_srt(srt_list):
    """
    srt_list - a list of SubRipFiles

    Stack the text of every file's cue at the same position (one per line)
    and return a new SubRipFile. Timing is taken from the first file.
    Returns None when srt_list is None or empty.
    """
    if srt_list is None:
        return None
    if len(srt_list) == 0:
        return None

    reference = srt_list[0]
    merged = pysrt.SubRipFile()
    for position in xrange(len(reference)):
        cue = pysrt.SubRipItem()
        pieces = [srt[position].text + "\n" for srt in srt_list]
        # Drop the trailing newline left by the last piece.
        cue.text = (cue.text + "".join(pieces)).rstrip()
        cue.start = reference[position].start
        cue.end = reference[position].end
        merged.append(cue)
    return merged
def execute(self, context):
    """Build subtitles for every text strip on the subtitle edit channel.

    For each TEXT strip on ``scene.subtitle_edit_channel`` the scene frame
    range is narrowed to the strip, the audio is mixed down to a temporary
    WAV file, the strip text is written with ``write_word_level`` and
    ``make_subs`` produces the timed subtitles. The collected subtitles are
    then added with ``subsutils.addSubs``.

    The scene's original frame range is restored before returning, even if
    a step fails; previously it was saved but never put back.

    Returns:
        ``{"FINISHED"}``.
    """
    scene = context.scene
    edit_channel = scene.subtitle_edit_channel
    fps = scene.render.fps / scene.render.fps_base
    original_start = scene.frame_start
    original_end = scene.frame_end

    all_strips = sorted(scene.sequence_editor.sequences_all,
                        key=lambda x: x.frame_start)
    text_strips = [
        strip for strip in all_strips
        if strip.type == "TEXT" and strip.channel == edit_channel
    ]

    # Scratch files live next to this module.
    wav_path = os.path.join(os.path.dirname(__file__), 'temp.wav')
    txt_path = os.path.join(os.path.dirname(__file__), 'temp.txt')
    srt_path = os.path.join(os.path.dirname(__file__), 'temp.srt')

    subs = pysrt.SubRipFile()
    try:
        for strip in text_strips:
            frame_start = strip.frame_start
            frame_end = strip.frame_final_end - 1
            start = (frame_start + 1) / fps
            text = strip.text

            # The mixdown exports the current scene frame range, so narrow
            # it to this strip first.
            scene.frame_start = frame_start
            scene.frame_end = frame_end
            bpy.ops.sound.mixdown(filepath=wav_path,
                                  container="WAV",
                                  codec="PCM")
            write_word_level(text, txt_path)
            subs.extend(make_subs(wav_path, txt_path, srt_path, start))
        subsutils.addSubs(context, subs, use_color=True)
    finally:
        scene.frame_start = original_start
        scene.frame_end = original_end
    return {"FINISHED"}
def export_to_srt(subtitulation): subtitles = db(db.subtitle.subtitulation_id == subtitulation.id).select( orderby=db.subtitle.starts) import pysrt import StringIO sio = StringIO.StringIO() mysrt = pysrt.SubRipFile(encoding=ENCODING) for i, subtitle in enumerate(subtitles): sri = pysrt.SubRipItem() if isinstance(subtitle.body, unicode): print "is unicode" sri.text = subtitle.body else: print "is not unicode" sri.text = unicode(subtitle.body, ENCODING) sri.start = pysrt.SubRipTime.from_time(subtitle.starts) sri.end = pysrt.SubRipTime.from_time(subtitle.ends) sri.index = i mysrt.append(sri) mysrt.write_into(sio) sio.seek(0) return sio
def split_subtitles(srt_file, invert_commercials, out_file):
    """Keep only the cues inside the given segments and re-time them.

    Args:
        srt_file: Path of the source subtitle file.
        invert_commercials: Sequence of ``(start, end)`` segments to keep;
            ``end`` may be None for an open-ended segment.
        out_file: Path the filtered subtitle file is saved to.

    Each kept cue is shifted by the running offset so the kept segments
    follow one another without the gaps between them.
    """
    source = pysrt.open(srt_file)
    kept = []
    previous_end = 0.0
    offset = 0
    for segment in invert_commercials:
        offset = offset - float(segment[0]) + previous_end
        for cue in source.data:
            # Skip cues that start before the segment ...
            if not (cue.start >= to_time(segment[0])):
                continue
            # ... or at/after its end, when the segment has one.
            if segment[1] is not None and not (cue.start < to_time(segment[1])):
                continue
            moved = copy(cue)
            delta = to_time(offset)
            moved.shift(hours=delta['hours'],
                        minutes=delta['minutes'],
                        seconds=delta['seconds'],
                        milliseconds=delta['milliseconds'])
            kept.append(moved)
        if segment[1] is not None:
            previous_end = segment[1]
        else:
            previous_end = -1
    result = pysrt.SubRipFile(items=kept)
    result.save(out_file)
def merge_sub(sub1, sub2, bar, driver):
    """Merge an English and a Russian subtitle track into one bilingual track.

    Both tracks are loaded with ``dataframe_sub`` (tagged "en" and "ru"),
    concatenated, and their cues grouped by agglomerative clustering on the
    ``start``/``end`` columns. Each cluster becomes one subtitle whose text
    is the English and Russian texts joined by ``space_sub``.

    Args:
        sub1: First subtitle source, loaded as language "en".
        sub2: Second subtitle source, loaded as language "ru".
        bar: Progress-bar budget; divided evenly across the clusters and
            also passed on to ``translate_sub``.
        driver: Passed through to ``translate_sub``.

    Returns:
        A ``pysrt.SubRipFile`` with the merged subtitles, re-indexed.

    Reads the module-level Tk state ``space_var``, ``clusters_auto_var``,
    ``clusters_manual_entry``, ``clusters_method_combobox``,
    ``translate_var`` and updates ``root`` / ``progressBar``.
    """
    # Separator between the two language texts inside one merged cue.
    if space_var.get() == 1:
        space_sub = '\n \n'
    else:
        space_sub = '\n'
    sub1_df = dataframe_sub(sub1, "en")
    sub2_df = dataframe_sub(sub2, "ru")
    df = pd.concat([sub1_df, sub2_df], axis=0)
    # NOTE(review): 'sum' and 'plus' are not read anywhere in this function.
    df['sum'] = df[['start', 'end']].sum(axis=1)
    df['plus'] = (df['start'] + df['end']) / 2
    df = df.sort_values(by='start', ascending=True)
    # Agglomerative clustering
    if clusters_auto_var.get() == 1:
        clusters_list = []
        # Quality assessment using the silhouette score
        silhouette = []
        # Try 20 distance thresholds between 2000 and 10000 and record the
        # cluster count and silhouette score each one produces.
        for i in np.linspace(0.2, 1, 20):
            root.update()
            threshold = float(i) * 10000
            clustering = AgglomerativeClustering(
                n_clusters=None,
                distance_threshold=threshold).fit(df[['start', 'end']])
            clusters = clustering.labels_
            clusters_list.append(len(pd.unique(clusters)))
            score = silhouette_score(df[['start', 'end']], clusters)
            silhouette.append(score)
        # Re-fit with the cluster count that scored best.
        max_silhouette = np.argmax(silhouette)
        clustering = AgglomerativeClustering(
            n_clusters=clusters_list[max_silhouette]).fit(df[['start', 'end']])
    else:
        # Manual mode: threshold and linkage come from the UI widgets.
        threshold = float(clusters_manual_entry.get()) * 10000
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=threshold,
            linkage=clusters_method_combobox.get()).fit(df[['start', 'end']])
    clusters = clustering.labels_
    # Add the found cluster labels
    df['cluster'] = clusters
    # Progress-bar increment per cluster.
    bar_subs = float(bar) / float(len(pd.unique(clusters)))
    # Create the new subtitle file
    double_sub = pysrt.SubRipFile(encoding='utf-8')
    translate_list = pysrt.SubRipFile(encoding='utf-8')
    for n, i in enumerate(pd.unique(clusters)):
        root.update()
        progressBar['value'] += bar_subs
        df_en = df[(df['language'] == 'en') & (df['cluster'] == i)]
        df_ru = df[(df['language'] == 'ru') & (df['cluster'] == i)]
        # Collapse each language's cues in this cluster into a single row:
        # texts joined with a space, spanning earliest start to latest end.
        df_group_en = df_en.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        df_group_ru = df_ru.groupby('cluster').agg({
            'text': ' '.join,
            'start': min,
            'end': max,
            'language': 'first'
        })
        # Combine the per-language rows into one row per cluster; the
        # language tags are concatenated (e.g. 'en' + 'ru').
        df_group = df_group_en.merge(
            df_group_ru,
            on=['cluster', 'text', 'start', 'end', 'language'],
            how='outer').groupby('cluster').agg({
                'text': space_sub.join,
                'start': 'first',
                'end': 'first',
                'language': ''.join
            })
        sub = pysrt.SubRipItem(index=n + 1,
                               start=int(df_group.iloc[0]['start']),
                               end=int(df_group.iloc[0]['end']),
                               text=str(df_group.iloc[0]['text']))
        double_sub.append(sub)
        # Only clusters whose joined language tag is exactly 'en' (no
        # Russian text in the cluster) are queued for translation.
        if translate_var.get() == 1 and df_group['language'].values == 'en':
            translate_list.append(sub)
    if translate_var.get() == 1 and translate_list:
        translate_sub(translate_list, bar, driver)
    # Re-index the subtitles
    double_sub.clean_indexes()
    return double_sub
args = parser.parse_args() # Set punctuation that we are looking for srtRawPath = args.srtRawPath inputName = args.inputName pathRealLine = args.pathRealLine outputSRT = args.outputSRT END_PUNCTUATION = '.?!' # Read files and debugging files srtOriginal = pysrt.open(srtRawPath+inputName+'.srt', encoding='utf-8')#, encoding='iso-8859-1' fileLine = pathRealLine+inputName+'_linea.txt' linesIterator = open(fileLine, encoding='utf-8') srtTransformed = pysrt.SubRipFile() srtTransformedName = outputSRT+inputName + '.srt' srtTransformedName1 = outputSRT+inputName + '_trans1.srt' srtTransformedName2 = outputSRT+inputName + '_trans2.srt' alertNumLines = open(outputSRT+inputName+"_alerta.txt",'w') # Number of lines in per-line file should be the same as the number of lines calculated in SRT-raw numLinesOrig = sum(1 for line in linesIterator) numSubs = len(srtOriginal) first_sub = srtOriginal[0] prevText = '' prevEnd = first_sub.end prevStart = first_sub.start
def find_summary_regions(srt_filename, summarizer, duration, language,
                         bonusWords, stigmaWords, videonamepart):
    """Pick the subtitle regions that summarize a video in about ``duration``.

    The sentence count is first estimated from the average subtitle
    duration, then adjusted one sentence at a time until the summarized
    regions reach (or drop to) the requested duration. The summarized
    subtitles are re-timed back to back from 0 and written to
    ``<videonamepart>.srt``.

    Args:
        srt_filename: Path of the source subtitle file.
        summarizer: Summarizer passed through to ``summarize``.
        duration: Target total duration of the summary.
        language: Passed through to ``summarize``.
        bonusWords: Passed through to ``summarize``.
        stigmaWords: Passed through to ``summarize``.
        videonamepart: Output path without the ``.srt`` extension.

    Returns:
        The list of summary regions, indexed as ``(start, end)``.

    Raises:
        ZeroDivisionError: If the subtitle file contains no subtitles.
    """
    srt_file = pysrt.open(srt_filename)

    # Average time a single subtitle stays on screen.
    clipList = list(map(srt_item_to_range, srt_file))
    avg_subtitle_duration = total_duration_of_regions(clipList) / len(srt_file)

    # Initial estimate of how many sentences fill the requested duration.
    n_sentences = duration / avg_subtitle_duration
    print("nsentance : " + str(n_sentences))

    summary, summarizedSubtitles = summarize(srt_file, summarizer,
                                             n_sentences, language,
                                             bonusWords, stigmaWords)
    total_time = total_duration_of_regions(summary)
    print("total_time : " + str(total_time))

    try_higher = total_time < duration
    prev_total_time = -1
    if try_higher:
        # The summary is too short: add one sentence at a time until it is
        # long enough, or until adding a sentence no longer changes the
        # total.
        while total_time < duration:
            if prev_total_time == total_time:
                print("1 : Maximum summarization time reached")
                break
            print("1 : total_time : duration " + str(total_time) + " " +
                  str(duration))
            n_sentences += 1
            summary, summarizedSubtitles = summarize(srt_file, summarizer,
                                                     n_sentences, language,
                                                     bonusWords, stigmaWords)
            prev_total_time = total_time
            total_time = total_duration_of_regions(summary)
    else:
        # The summary is too long: remove one sentence at a time until it
        # fits, never going below two sentences.
        while total_time > duration:
            if n_sentences <= 2:
                print("2 : Minimum summarization time reached")
                break
            print("2 : total_time : duration " + str(total_time) +
                  str(duration))
            n_sentences -= 1
            summary, summarizedSubtitles = summarize(srt_file, summarizer,
                                                     n_sentences, language,
                                                     bonusWords, stigmaWords)
            total_time = total_duration_of_regions(summary)

    print("************ THis is summary array *********")
    print(summary)
    print("**********************************")
    print(
        "************************THis is summarizedSubtitles array *******************"
    )
    print(summarizedSubtitles)
    print("**********************************************************")

    # Re-time the summarized subtitles so each starts where the previous
    # one ended, keeping each region's original length.
    starting = 0
    sub_rip_file = pysrt.SubRipFile()
    for index, item in enumerate(summarizedSubtitles):
        newSubitem = pysrt.SubRipItem()
        newSubitem.index = index
        newSubitem.text = item.text
        # Use a separate name so the ``duration`` parameter is not
        # overwritten.
        region_length = summary[index][1] - summary[index][0]
        ending = starting + region_length
        newSubitem.start.seconds = starting
        newSubitem.end.seconds = ending
        sub_rip_file.append(newSubitem)
        starting = ending
    print(sub_rip_file)

    path = videonamepart + ".srt"
    # The ``with`` block closes the file; no explicit close is needed.
    with open(path, "w+") as sf:
        for entry in sub_rip_file:
            sf.write(str(entry))
            sf.write("\n")

    # return the resulting summarized region array
    return summary
def CreateCleanSubAndMuteList(self):
    """Write the censored subtitle file, the mute filters and optional EDL.

    The input subtitles are copied to ``<name>_utf8<ext>`` and converted
    with ``UTF8Convert``. The swears file (one ``word`` or
    ``word|replacement`` per line) is loaded into ``self.swearsMap``. A
    subtitle is kept when it contains a listed word, or when a pad is set
    and it lies within ``self.swearsPadMillisec`` of a neighbouring
    subtitle that does. One ffmpeg ``volume`` filter per kept subtitle is
    stored in ``self.muteTimeList``; with ``self.edl`` set, the same ranges
    are written to the EDL file.

    Raises:
        IOError: If the input subtitle file is unspecified or missing.
    """
    if (self.inputSubsFileSpec is None) or (not os.path.isfile(self.inputSubsFileSpec)):
        raise IOError(
            errno.ENOENT,
            f"Input subtitle file unspecified or not found ({os.strerror(errno.ENOENT)})",
            self.inputSubsFileSpec,
        )

    subFileParts = os.path.splitext(self.inputSubsFileSpec)
    self.tmpSubsFileSpec = subFileParts[0] + "_utf8" + subFileParts[1]
    shutil.copy2(self.inputSubsFileSpec, self.tmpSubsFileSpec)
    UTF8Convert(self.tmpSubsFileSpec)

    if not self.cleanSubsFileSpec:
        self.cleanSubsFileSpec = subFileParts[0] + "_clean" + subFileParts[1]

    if not self.edlFileSpec:
        cleanSubFileParts = os.path.splitext(self.cleanSubsFileSpec)
        self.edlFileSpec = cleanSubFileParts[0] + '.edl'

    with open(self.swearsFileSpec) as f:
        lines = [line.rstrip('\n') for line in f]
    for line in lines:
        lineMap = line.split("|")
        if len(lineMap) > 1:
            self.swearsMap[lineMap[0]] = lineMap[1]
        else:
            self.swearsMap[lineMap[0]] = "*****"

    # The pattern matches case-insensitively, so the replacement lookup must
    # be case-insensitive too; looking up the matched text verbatim raised
    # KeyError for e.g. "Damn" when the list contains "damn".
    swearsLookup = {word.lower(): mask for word, mask in self.swearsMap.items()}
    # A blank entry would add an empty alternative that matches at every
    # word boundary; escaping keeps regex metacharacters literal.
    swearWords = [re.escape(word) for word in self.swearsMap.keys() if word]
    replacer = (
        re.compile(r'\b(' + '|'.join(swearWords) + r')\b', re.IGNORECASE)
        if swearWords
        else None
    )

    def scrub(text):
        # Return the text with every listed word replaced by its mask.
        if replacer is None:
            return text
        return replacer.sub(lambda m: swearsLookup.get(m.group().lower(), "*****"), text)

    subs = pysrt.open(self.tmpSubsFileSpec)
    newSubs = pysrt.SubRipFile()
    newTimestampPairs = []

    # for each subtitle in the set
    #   if text contains profanity...
    #   OR if the next text contains profanity and lies within the pad ...
    #   OR if the previous text contained profanity and lies within the pad ...
    # then include the subtitle in the new set
    prevNaughtySub = None
    for sub, subPeek in pairwise(subs):
        newText = scrub(sub.text)
        newTextPeek = scrub(subPeek.text) if (subPeek is not None) else None
        # NOTE(review): the "previous sub" test below is only reached when
        # there is a next sub (newTextPeek is not None), so the final
        # subtitle is never included on the strength of its predecessor -
        # confirm whether that is intended.
        if (
            # this sub contains profanity, or
            (newText != sub.text)
            or
            # we have defined a pad, and
            (
                (self.swearsPadMillisec > 0)
                and (newTextPeek is not None)
                and
                # the next sub contains profanity and is within pad seconds of this one, or
                (
                    (
                        (newTextPeek != subPeek.text)
                        and ((subPeek.start.ordinal - sub.end.ordinal) <= self.swearsPadMillisec)
                    )
                    or
                    # the previous sub contained profanity and is within pad seconds of this one
                    (
                        (prevNaughtySub is not None)
                        and ((sub.start.ordinal - prevNaughtySub.end.ordinal) <= self.swearsPadMillisec)
                    )
                )
            )
        ):
            subScrubbed = newText != sub.text
            newSub = sub
            newSub.text = newText
            newSubs.append(newSub)
            if subScrubbed:
                prevNaughtySub = sub
                # Widen the muted range by the pad; clamp at 0 so a swear
                # in the first few milliseconds cannot yield a negative
                # time, which to_time() cannot represent.
                newTimes = [
                    pysrt.SubRipTime.from_ordinal(
                        max(0, sub.start.ordinal - self.swearsPadMillisec)
                    ).to_time(),
                    pysrt.SubRipTime.from_ordinal(sub.end.ordinal + self.swearsPadMillisec).to_time(),
                ]
            else:
                prevNaughtySub = None
                newTimes = [sub.start.to_time(), sub.end.to_time()]
            newTimestampPairs.append(newTimes)
        else:
            if self.fullSubs:
                newSubs.append(sub)
            prevNaughtySub = None

    newSubs.save(self.cleanSubsFileSpec)

    self.muteTimeList = []
    edlLines = []
    for timePair in newTimestampPairs:
        # Convert each datetime.time into seconds.
        lineStart = (
            (timePair[0].hour * 60.0 * 60.0)
            + (timePair[0].minute * 60.0)
            + timePair[0].second
            + (timePair[0].microsecond / 1000000.0)
        )
        lineEnd = (
            (timePair[1].hour * 60.0 * 60.0)
            + (timePair[1].minute * 60.0)
            + timePair[1].second
            + (timePair[1].microsecond / 1000000.0)
        )
        self.muteTimeList.append(
            "volume=enable='between(t," + format(lineStart, '.3f') + "," + format(lineEnd, '.3f') + ")':volume=0"
        )
        if self.edl:
            # NOTE(review): the start uses one decimal place while the end
            # uses three - confirm this asymmetry is intended.
            edlLines.append(f"{format(lineStart, '.1f')}\t{format(lineEnd, '.3f')}\t1")

    if self.edl and (len(edlLines) > 0):
        with open(self.edlFileSpec, 'w') as edlFile:
            for item in edlLines:
                edlFile.write(f"{item}\n")
def process(self):
    """Re-time an SRT file from a Gentle forced-alignment result.

    The alignment JSON (``self.jsonfile``) supplies a transcript with one
    sentence per line and per-word timings. For every sentence the start of
    its first successfully aligned word and the end of its last one become
    the cue times; sentences without any aligned word keep the times of the
    corresponding cue in ``self.srtfile``. The result is saved to
    ``self.outfile`` as UTF-8.

    Raises:
        AssertionError: If the sentence count differs from the number of
            input cues, or a transcript word does not match the aligned
            word at the same position.
    """
    gentle_file = self.jsonfile
    out_file = self.outfile
    srt_file = self.srtfile

    # Close the JSON file deterministically instead of leaking the handle.
    with open(gentle_file, 'r') as handle:
        g = json.load(handle)
    t = g['transcript']
    g_words = [
        w for w in g['words'] if w['case'] != 'not-found-in-transcript'
    ]

    sentences = t.split('\n')
    sentences = [sent.replace('-', ' ') for sent in sentences]

    inputsrt_elems = pysrt.open(srt_file)
    assert (len(inputsrt_elems) == len(sentences))

    word_pattern = re.compile(r'(\w|\’\w|\'\w)+', re.UNICODE)
    srt_elems = pysrt.SubRipFile()
    # Position in g_words; advances once per word found in the transcript.
    counter = 0
    for sent_i, sent in enumerate(sentences):
        if not isinstance(sent, type(u'')):
            sent = sent.decode('utf-8')
        start_time_found = False
        for cur_word in sent.split():
            # Tokens without any word characters yield no matches here.
            for w in word_pattern.finditer(cur_word):
                word = w.group()
                gentle_word = g_words[counter]
                clean_word = word_pattern.search(word).group()
                assert (
                    clean_word.lower() == gentle_word['word'].lower())
                if gentle_word['case'] == 'success':
                    # Retain first valid time boundary
                    if not start_time_found:
                        start_time = gentle_word['start']
                        start_time_found = True
                    # keep scanning until the last valid time boundary
                    end_time = gentle_word['end']
                counter += 1

        if not start_time_found:
            # No aligned word in this sentence: fall back to the input cue.
            start_time = inputsrt_elems[sent_i].start
            end_time = inputsrt_elems[sent_i].end
        else:
            start_time = extract_time_tuple(start_time)
            end_time = extract_time_tuple(end_time)

        elem = pysrt.SubRipItem()
        elem.index = sent_i + 1
        elem.text = sent
        elem.start = start_time
        elem.end = end_time
        srt_elems.append(elem)

    srt_elems.save(out_file, encoding='utf-8')
audio = r.record(source) text = r.recognize_sphinx(audio) file = pysrt.open('my_srt.srt', encoding='utf-8') sub = pysrt.SubRipItem() sub.index = counter counter += 1 sub.start.milliseconds = start_chunk + splits[i] sub.end.milliseconds = start_chunk + splits[i + 1] sub.text = text file.append(sub) file.save('my_srt.srt', encoding='utf-8') print(text) shutil.rmtree('./splitAudio') os.mkdir('./splitAudio') sound_file = AudioSegment.from_file(sys.argv[1], "mp4") len_file = len(sound_file) print("Length of track: ", len_file / second, "seconds") file = pysrt.SubRipFile(encoding='utf-8') file.save('my_srt.srt', encoding='utf-8') chunk_end = chunk_size while (chunk_end < len_file): chunk_file = sound_file[chunk_end - chunk_size:chunk_end] do_subtitles_generation(chunk_file, chunk_end - chunk_size) chunk_end += chunk_size do_subtiles_generation(sound_file[chunk_end - chunk_size:], chunk_end - chunk_size)
html = driver.page_source soup = BeautifulSoup(html.encode('utf-8'), features='html.parser') maindiv = soup.findAll("div", {"id": "show"})[0] basicdiv = maindiv.findAll("b") break except: # html = driver.find_element_by_tag_name('html') time.sleep(1) driver.refresh() submissing = False if html.find( "Sorry, there are no subtitle available for this video." ) != -1: file = pysrt.SubRipFile() sub = pysrt.SubRipItem(1, start='00:00:00,000', end='00:00:01,000', text="Sub was not found") file.append(sub) subtype = "F-" file.save("H:\#Everything Else\#Project Ashwini\SRT\\" + channelName + "\\" + subtype + channelName + "-" + str(videolinknum) + ".srt", encoding='utf-8') submissing = True print("Sub missing") elif len(basicdiv) <= 1: file = pysrt.SubRipFile()
def CreateCleanSubAndMuteList(self, cleanSubsFileSpec=None):
    """Write censored subtitle files and build the matching mute filters.

    Two files are produced: ``self.cleanSubsFileSpec`` holds only the
    subtitles that contained a listed word (censored), and
    ``self.cleanSubsNotModFileSpec`` holds every subtitle, with censored
    text where applicable. One ffmpeg ``volume`` filter per censored
    subtitle is stored in ``self.muteTimeList``.

    Args:
        cleanSubsFileSpec: Output path for the censored-only subtitles.
            When omitted, names are derived from the input file and, if the
            input exists, it is also copied to a ``.orig`` sibling.
    """
    subFileParts = os.path.splitext(self.inputSubsFileSpec)
    if cleanSubsFileSpec is not None:
        self.cleanSubsFileSpec = cleanSubsFileSpec
        subFileParts = os.path.splitext(self.cleanSubsFileSpec)
        self.cleanSubsNotModFileSpec = subFileParts[0] + "_all_not_cleaned" + subFileParts[1]
    else:
        # Split off a second suffix (e.g. a language code) so the markers
        # are inserted before it.
        subFileFirstParts = os.path.splitext(subFileParts[0])
        self.cleanSubsFileSpec = subFileFirstParts[0] + ".clean" + subFileFirstParts[1] + ".forced" + subFileParts[1]
        self.cleanSubsNotModFileSpec = subFileFirstParts[0] + '.clean' + subFileFirstParts[1] + subFileParts[1]
        # Keep a copy of the untouched input next to the generated files.
        if os.path.isfile(self.inputSubsFileSpec):
            shutil.copyfile(self.inputSubsFileSpec, subFileFirstParts[0] + '.orig' + subFileFirstParts[1] + subFileParts[1])

    # remove brackets that interfere with ffmpeg subtitles filter
    self.cleanSubsFileSpec = self.cleanSubsFileSpec.translate({ord(x): '' for x in ['[',']']})

    with open(self.swearsFileSpec) as f:
        lines = [line.rstrip('\n') for line in f]
    for line in lines:
        lineMap = line.split("|")
        if len(lineMap) > 1:
            self.swearsMap[lineMap[0]] = lineMap[1]
        else:
            self.swearsMap[lineMap[0]] = "*****"

    # The pattern matches case-insensitively, so the replacement lookup must
    # be case-insensitive too; looking up the matched text verbatim raised
    # KeyError for e.g. "Damn" when the list contains "damn".
    swearsLookup = {word.lower(): mask for word, mask in self.swearsMap.items()}
    # A blank entry would add an empty alternative that matches at every
    # word boundary; escaping keeps regex metacharacters literal.
    swearWords = [re.escape(word) for word in self.swearsMap.keys() if word]
    replacer = None
    if swearWords:
        replacer = re.compile(r'\b(' + '|'.join(swearWords) + r')\b', re.IGNORECASE)

    def scrub(text):
        # Return the text with every listed word replaced by its mask.
        if replacer is None:
            return text
        return replacer.sub(lambda m: swearsLookup.get(m.group().lower(), "*****"), text)

    # Detect the input encoding with libmagic; close the handle right away.
    with open(self.inputSubsFileSpec, 'rb') as rawFile:
        blob = rawFile.read()
    m = magic.open(magic.MAGIC_MIME_ENCODING)
    m.load()
    encoding = m.buffer(blob)

    subs = pysrt.open(self.inputSubsFileSpec, encoding=encoding)
    newSubs = pysrt.SubRipFile()
    newSubsNotMod = pysrt.SubRipFile()
    for sub in subs:
        newText = scrub(sub.text)
        if newText != sub.text:
            newSub = sub
            newSub.text = newText
            newSubs.append(newSub)
        # Every subtitle goes into the full set; censored ones carry the
        # replaced text because the item was modified in place above.
        newSubsNotMod.append(sub)
    newSubs.save(self.cleanSubsFileSpec)
    newSubsNotMod.save(self.cleanSubsNotModFileSpec)

    newLines = []
    for sub in newSubs:
        newLines.append([sub.start.to_time(), sub.end.to_time()])

    self.muteTimeList = []
    for timePair in newLines:
        # Convert each datetime.time into seconds for the ffmpeg filter.
        lineStart = (timePair[0].hour * 60.0 * 60.0) + (timePair[0].minute * 60.0) + timePair[0].second + (timePair[0].microsecond / 1000000.0)
        lineEnd = (timePair[1].hour * 60.0 * 60.0) + (timePair[1].minute * 60.0) + timePair[1].second + (timePair[1].microsecond / 1000000.0)
        self.muteTimeList.append("volume=enable='between(t," + format(lineStart, '.3f') + "," + format(lineEnd, '.3f') + ")':volume=0")