def readAllFiles(path):
    """Split movies described by *.scenes files under `path` and slice their subtitles.

    Each .scenes file contains the movie file name on its first line followed by
    "start,end" timecode pairs, one per line.  For every pair, ffmpeg cuts the
    movie into `path`/parted/ and the matching .srt is sliced alongside it;
    finally all sliced subtitles are dumped to .txt via srtToTxt().

    NOTE(review): assumes `path` ends with a path separator — the original
    string concatenation is kept on purpose.
    """
    for root, dirs, files in os.walk(path):
        files.sort()
        for fileread in files:
            if not fileread.endswith(".scenes"):
                continue
            with open(path + fileread) as a_file:
                lines = a_file.readlines()
            fileName = lines[0].rstrip()  # first line names the movie file
            del lines[0]
            number = 1
            listSrt = []
            for item in lines:
                lhs, rhs = item.split(",", 1)  # "start,end" timecodes
                partName, ext = fileName.split(".", 1)
                newPath = path + 'parted/'
                if not os.path.exists(newPath):
                    os.makedirs(newPath)
                newFileNameMovie = newPath + partName + '_' + str(number) + '.mp4'
                newFileNameSrt = newPath + partName + '_' + str(number) + '.srt'
                number += 1
                # Split movie file
                try:
                    if ext == 'mp4':
                        # mp4 source: stream copy, no re-encode
                        os.system('ffmpeg -i "%s" -ss "%s" -to "%s" -c copy "%s" ' % (
                            path + fileName, lhs, rhs.rstrip(), newFileNameMovie))
                    else:
                        # other containers: re-encode video to H.264, copy audio
                        os.system('ffmpeg -i "%s" -ss "%s" -to "%s" -c:v libx264 -c:a copy "%s" ' % (
                            path + fileName, lhs, rhs.rstrip(), newFileNameMovie))
                except Exception:  # was a bare except: kept best-effort, made explicit
                    print("Error with spliting movie file")
                # Split *.srt file
                try:
                    try:
                        subs = pysrt.open(path + partName + '.srt')
                    except UnicodeDecodeError:
                        subs = pysrt.open(path + partName + '.srt', encoding='iso-8859-1')
                    Hs, Ms, Ss = lhs.split(":", 2)
                    He, Me, Se = rhs.split(":", 2)
                    part = subs.slice(
                        starts_after={'hours': int(Hs), 'minutes': int(Ms), 'seconds': int(Ss)},
                        ends_before={'hours': int(He), 'minutes': int(Me), 'seconds': int(Se)})
                    part.save(newFileNameSrt)
                    listSrt.append(newFileNameSrt)
                except Exception:  # was a bare except
                    print("Error with spliting srt file")
            if not listSrt:
                print("Error there are no srt files")
            else:
                srtToTxt(newPath)
def _detect_subtitle_language(srt_path):
    """Detect the language of an .srt file from its first five entries.

    Returns a Language on a confident detection, otherwise None.
    """
    log.debug('Detecting subtitle language')
    # Load the srt file: iso-8859-1 first, then utf-8 as fallback.
    subtitle = None
    for encoding in ('iso-8859-1', 'utf-8'):
        try:
            subtitle = pysrt.open(path=srt_path, encoding=encoding)
        except Exception:
            continue
        break
    if subtitle is None:
        # Unreadable file -> nothing to detect
        return None
    # Need at least 5 entries to get a meaningful sample.
    if len(subtitle) >= 5:
        sample = ''.join(entry.text for entry in subtitle[0:5])
        detected_languages = langdetect.detect_langs(sample)
        log.debug('Detected subtitle language(s): %s', detected_languages)
        if len(detected_languages) > 0:
            # List is sorted by probability, highest first.
            best = detected_languages[0]
            if best.prob >= autosubliminal.DETECTEDLANGUAGEPROBABILITY:
                log.debug('Probability of detected subtitle language accepted: %s', best)
                return Language.fromietf(best.lang)
            log.debug('Probability of detected subtitle language too low: %s', best)
    return None
def get_word_freq_dict(inputfiles, verification_list):
    """Build a word-frequency dict from the text of several .srt files.

    Files that cannot be decoded with the default, utf8 or iso-8859-1
    encodings are skipped.  Words are inserted via insert_word(), which
    checks them against `verification_list`.
    """
    freq_dict = {}
    for inputfile in inputfiles:
        print('processing %s' % inputfile)
        try:
            subs = pysrt.open(inputfile)
        except UnicodeDecodeError:
            subs = []
        if not subs:
            # Retry with explicit encodings before giving up.
            for enc in ('utf8', 'iso-8859-1'):
                try:
                    print('trying with %s' % enc)
                    subs = pysrt.open(inputfile, encoding=enc)
                except UnicodeDecodeError:
                    subs = []
                if subs:
                    break
        if not subs:
            print('couldnt open  %s' % inputfile)  # double space preserved from original output
            continue
        for sub in subs:
            for w in sub.text.split():
                insert_word(freq_dict, w, verification_list)
    print('%s %s' % (len(freq_dict), sum(freq_dict.values())))
    return freq_dict
def collect_subtitles_lines(self, subtitle_file, file_path=None):
    """Import every line of a subtitle file into SubtitlesLine rows.

    Parameters
    ----------
    subtitle_file : SubtitleFile-like model with directory/file_name/language.
    file_path : optional explicit path; defaults to the model's location.
    """
    if not file_path:
        file_path = os.path.join(
            subtitle_file.directory, subtitle_file.file_name)
    try:
        subs = pysrt.open(file_path)
    except UnicodeDecodeError:
        # Fall back to a legacy single-byte encoding.
        subs = pysrt.open(file_path, encoding='iso-8859-1')
    for sub in subs:
        # pysrt ordinals are milliseconds; render as H:MM:SS strings.
        start = str(datetime.timedelta(milliseconds=sub.start.ordinal))
        end = str(datetime.timedelta(milliseconds=sub.end.ordinal))
        text = sub.text
        try:
            line = SubtitlesLine.objects.create(
                subtitlefile=subtitle_file,
                index=sub.index,
                start=start,
                end=end,
                text=text,
            )
        except (ValidationError, ValueError):
            print('Ignoring: {t}'.format(t=text.encode('utf8')))
            continue
        # Populate the full-text search vector for this line.
        line.text_vector = SearchVector(
            'text', config=subtitle_file.language)
        line.save()
def join_srt_files(srt_top, srt_btm, srt_out):
    """Merge two subtitle files and write the result to srt_out.

    Bottom subtitles are kept verbatim; each top subtitle is wrapped with
    TOP_SRT_TEMPLATE so it renders at the top of the screen.
    """
    top_entries = pysrt.open(srt_top)
    bottom_entries = pysrt.open(srt_btm)
    combined = pysrt.SubRipFile(items=bottom_entries)
    for entry in top_entries:
        entry.text = TOP_SRT_TEMPLATE.format(entry.text)
        combined.append(entry)
    combined.sort()
    combined.clean_indexes()
    combined.save(srt_out)
def srtToTxt(dirName):
    """Dump the text of every .srt file in dirName into a matching UTF-8 .txt file."""
    for infile in glob.glob(os.path.join(dirName, '*.srt')):
        try:
            subs = pysrt.open(infile)
        except UnicodeDecodeError:
            # Retry with a legacy single-byte encoding.
            subs = pysrt.open(infile, encoding='iso-8859-1')
        outfile = infile[:-4] + '.txt'
        # `with` guarantees the output file is closed even on write errors.
        with codecs.open(outfile, "w", encoding="utf-8") as f:
            for sub in subs:
                f.write(sub.text)
def emptyEntries(myFile, keep, verbose):
    """Remove subtitle entries whose text lines are empty/whitespace.

    Parameters: myFile - path to the .srt file (read/written in utf-8);
    keep - when True, back up the original as <myFile>.emptyEntries before saving;
    verbose - print progress messages.
    Returns True when at least one empty entry was found and removed.
    """
    emptyEntryFound = False
    emptyEntries = 0           # count of empty entries found (shadows function name; kept)
    entriesToDelete = []       # indexes of entries to delete
    if verbose:
        print "--- Searching for empty entries"
    subs = pysrt.open(myFile, encoding='utf-8')  # open sub with pysrt as utf-8
    entries = len(subs)  # count entries
    if verbose:
        print "--- %s entries total" % entries
    for entryNo in range(0, entries):  # count entry numbers up to number of entries
        subEntry = u"%s" % subs[entryNo]  # render single entry as text
        lines = subEntry.split('\n')  # split entry into lines (index / times / text...)
        lineNo = 0  # set first line to 0
        emptyEntry = False
        for row in lines:  # read lines one by one
            if lineNo == 2:
                # Third line is the first text line; treat whitespace-only as empty.
                if (row == " " or row == " " or not row):
                    emptyEntry = True
            if emptyEntry and lineNo == 3 and row == "":  # third line empty AND fourth line empty
                emptyEntryFound = True
                emptyEntries += 1
                entriesToDelete.append(entryNo)  # add entry number to list
            lineNo += 1
    if emptyEntryFound:  # if empty entry is found
        print "*** %s empty entries found" % emptyEntries
        # Delete in reverse so earlier indexes stay valid while deleting.
        for entryNo in reversed(entriesToDelete):
            # print lineNo
            del subs[entryNo]  # delete entry
        if keep:
            if verbose:
                print "--- Copying original file to %s.emptyEntries" % myFile
            copyfile(myFile, "%s.emptyEntries" % myFile)
        subs.save(myFile, encoding='utf-8')  # save sub
        subs = pysrt.open(myFile, encoding='utf-8')  # re-open to confirm the new entry count
        entries = len(subs)  # count entries
        print "--- Now has %s entries" % entries
    return emptyEntryFound
def extract_lines(subtitle_path):
    """Return cleaned subtitle text lines from an .srt file.

    HTML formatting tags are stripped and any entry containing a URL
    (usually ads or subtitle credits) is skipped entirely.
    """
    try:
        parsed = pysrt.open(subtitle_path)
    except UnicodeDecodeError:
        parsed = pysrt.open(subtitle_path, encoding='latin1')
    collected = []
    for entry in parsed:
        # Drop any formatting done via HTML tags.
        cleaned = re.sub('<[^<]+?>', '', entry.text)
        # Entries with links are irrelevant for our purposes.
        if re.search(URL_REGEX, cleaned):
            continue
        collected.append(cleaned)
    return collected
def parseSubs(subtitles):
    """Parse every .srt under SUB_PATH_BASE_DIR and append Subtitle objects to `subtitles`.

    Subtitle end times are extended slightly because some subbers cut subs
    prematurely: if the gap to the next sub is <= 2s, extend by half the gap,
    otherwise extend by a flat 1 second.  Entries with no words are skipped.
    """
    for filename in os.listdir(SUB_PATH_BASE_DIR):
        print("Parsing srt file: " + filename)
        try:
            subs = pysrt.open(SUB_PATH_BASE_DIR + filename)
        except Exception:  # was a bare except: keep skip-on-any-failure behavior
            print("Could not parse " + filename)
            continue
        for i in range(len(subs)):
            sub = subs[i]
            if i != len(subs) - 1:
                # Extend this sub toward the start of the next one.
                nextSub = subs[i + 1]
                timeToNextSub = nextSub.start - sub.end
                secondsToNextSub = timeToNextSub.seconds + timeToNextSub.milliseconds / 1000.0
                if secondsToNextSub <= 2:
                    sub.end.seconds += secondsToNextSub / 2.0
                else:
                    sub.end.seconds += 1
            if len(sub.text.split()) == 0:
                continue  # skip subs with no words
            CurrentSubtitle = Subtitle(sub, filename)
            subtitles.append(CurrentSubtitle)
def _read_subtitle(self, subtitle_filename):
    """Read the subtitle file and output dialogs.

    Lines are grouped into dialogs; a dialog ends when a line finishes
    with sentence-final punctuation.
    """
    raw = pysrt.open(subtitle_filename, encoding='iso-8859-1')
    stripped = [line.strip() for line in raw.text.split('\n')]
    cleaned = [quote_matches.sub('', line).strip() for line in stripped]
    # Group consecutive lines into dialogs.
    dialogs = []
    start_new = True
    for line in cleaned:
        if not line:  # drop blank lines
            continue
        if start_new:
            dialogs.append([line])  # open a new dialog
        else:
            dialogs[-1].append(line)  # continue the last dialog
        # Sentence-final punctuation closes the current dialog.
        start_new = line[-1] in ('.', '!', '?', ':', ')')
    # Collapse each dialog's lines into a single string.
    return [' '.join(parts) for parts in dialogs]
def parse_srt(sub_file=None):
    """Flatten an SRT file into a list of cleaned text chunks.

    When no file is given, a random one is picked via get_random_srt().
    """
    if not sub_file:
        sub_file = get_random_srt()
    debug(u"Using {} as SRT".format(sub_file))
    try:
        subs = srt.open(sub_file)
    except:  # noqa: E722 -- original deliberately retries with latin1 on any failure
        subs = srt.open(sub_file, "latin1")
    # Collapse newlines, scrub junk, then stitch split pieces back together.
    flattened = junk_re.sub(" ", subs.text.replace("\n", " "))
    pieces = iter(split_re.split(flattened))
    return [chunk + next(pieces, '').strip() for chunk in pieces]
def handle_subtitle(cls, filename, target=None, to='zh', by_words=True):
    """Translate an .srt file via BaiduTranslate and save the result.

    by_words=True annotates each known word with its translation in
    parentheses; otherwise whole entries are translated and appended
    below the original text.  Saves to `target` or '<filename>.<to>.srt'.
    """
    subs = pysrt.open(filename)
    words_list = cls.init_word_list()
    for sub in subs:
        if by_words:
            pieces = []
            translations = BaiduTranslate.translate(sub.text.replace(' ', '\n'))
            for word in translations:
                if cls.is_word_valid(word, words_list):
                    pieces.append(word + '(' + translations.get(word) + ') ')
                else:
                    pieces.append(word + ' ')
            sub.text = ''.join(pieces)
            print(sub.text)
        else:
            try:
                translated = BaiduTranslate.translate(sub.text, to=to)
            except requests.exceptions.ReadTimeout:
                # Back off on timeout and leave this entry untranslated.
                time.sleep(10)
                BaiduTranslate.log('HTTP TIME OUT : ' + sub.text)
                continue
            for key in translated:
                sub.text += '\n' + translated[key]
    subs.save(target or filename + '.' + to + '.srt')
    return True
def download_sub(self):
    """Download subtitle candidates and optionally validate them against the movie.

    Tries DBSub then OpenSubs; when self.validate is set, each candidate's
    text around validate.start_min is compared via validate.validate().
    Calls self._final(True) on the first validated subtitle, otherwise
    removes the candidate and continues; self._final(False) when none match.
    """
    print('Validation: ' + str(self.validate))
    if self.validate:
        validate = Validate(self.movie_path)
    chain_iterators = chain(DBSub().download_sub(self.movie_path),
                            OpenSubs().download_sub(self.movie_path))
    for file_path in chain_iterators:
        if self.validate:
            subs = pysrt.open(file_path)
            # Grab the slice around the validation minute mark.
            text_slices = subs.slice(
                starts_after={'minutes': validate.start_min - 1, 'seconds': 59},
                ends_before={'minutes': validate.start_min, 'seconds': 11})
            # Join slice texts and normalize whitespace.
            text = ' '.join(t_slice.text for t_slice in text_slices.data)
            text = ' '.join(text.split())
            print("For file : {} Movie Text is : {}".format(file_path, text))
            if validate.validate(text):
                print("Found validated subtitle")
                self._final(True)
                return
            os.remove(file_path)  # candidate failed validation
        else:
            continue
    self._final(False)
def start(text_input, language_analysis_stimulated):
    """Replay a captions file in (approximate) real time, spawning a memory-writer
    process for each word's phoneme group.

    Parameters: text_input - path to the captions (.srt) file;
    language_analysis_stimulated - shared multiprocessing value flag (1 while a
    subtitle is being processed, 0 otherwise).
    Raises ValueError when the captions file does not exist.
    """
    #time.sleep(0.3)  # Wait 0.5 seconds for other processes's start
    t0 = time.time()  # Initiation time
    if os.path.exists(text_input):  # If captions file exist
        subs = pysrt.open(text_input)  # Get whole subtitles
        i = 0  # Step counter
        while i < len(subs):  # While step counter less than amount of subtitles
            time.sleep(0.1)  # Wait 0.5 seconds to prevent aggressive loop
            # Poll until wall-clock time (plus a 0.8s lead) reaches this subtitle's start.
            if (time.time() - t0 + 0.8) > subs[i].start.seconds:
                sub_starting_time = datetime.datetime.now()  # Starting time of the memory
                language_analysis_stimulated.value = 1  # Language analysis stimulated
                # Ending time = start + subtitle duration (whole seconds only).
                sub_ending_time = sub_starting_time + datetime.timedelta(seconds=(subs[i].end - subs[i].start).seconds)
                sub = subs[i].text.encode('ascii','ignore')  # ASCII-only text (Py2 str.translate below)
                sub = sub.translate(None, '!@#$?,')  # strip punctuation
                words = sub.split()
                phone_groups = []
                for word in words:
                    phone_groups.append(LanguageAnalyzer.word_to_phones(word))
                phones = " ".join(phone_groups)
                # Spread the subtitle duration evenly over all phone characters.
                phone_duration = datetime.timedelta(seconds=(subs[i].end - subs[i].start).seconds) / len(phones)
                starting_time = sub_starting_time
                for word_inphones in phone_groups:
                    ending_time = starting_time + phone_duration * len(word_inphones.split())
                    if ending_time <= sub_ending_time and word_inphones != "":
                        process5 = multiprocessing.Process(target=LanguageMemoryUtil.add_memory,
                                                           args=(word_inphones, starting_time, ending_time))  # Define write memory process
                        process5.start()  # Start write memory process
                        # NOTE(review): advance kept inside the success branch; once the
                        # window is exceeded no later word can fit, so behavior matches.
                        starting_time = ending_time + datetime.timedelta(milliseconds=50)
                print subs[i].text + "\n"  # Print subtitle's text
                print phones + "\n"
                print "_____________________________________________________________________________________\n"
                language_analysis_stimulated.value = 0  # Language analysis NOT stimulated
                i += 1  # Increase step counter
    else:  # If captions file doesn't exist
        raise ValueError('VTT file doesn\'t exist!')  # Raise a ValueError
def cut_subtitle(self):
    """Cut the selected subtitle file between the chosen start/stop times.

    The encoding is sniffed with chardet; utf-8 and latin1 are tried as
    fallbacks.  On success the cut file is written and None is returned;
    when every encoding fails a warning dialog is shown and the intended
    output path is returned.
    """
    sbt_in = self.subtitle_pick.get_text()
    if os.path.isfile(sbt_in):
        sbt_out = self.save_pick.get_text() + os.path.splitext(sbt_in)[1]
        h1, m1, s1 = self.start.get_h_m_s()
        h2, m2, s2 = self.stop.get_h_m_s()
        import chardet
        # Sniff the encoding from the first MiB; `with` fixes the leaked handle.
        with open(sbt_in, "rb") as raw:
            detected = chardet.detect(raw.read(1024 * 1024))
        enc = detected["encoding"]
        cnf = detected["confidence"]
        e = None
        encs = OrderedSet([enc, "utf-8", "latin1"])
        for encoding in encs:
            try:
                logger.info("Trying to open subtitle with encoding %s" % encoding)
                subs = pysrt.open(sbt_in, error_handling=pysrt.ERROR_LOG, encoding=encoding)
                subtitle_cut(h1, m1, s1, h2, m2, s2, subs, sbt_out)
                return
            except Exception as ae:
                e = e or ae  # remember the first failure for the dialog
                logger.warning("encoding %s failed", encoding, exc_info=1)
        msg = (
            "Could not open {} with any of the following encodings:\n {}\n\n"
            "Confidence on {} was {}.\nFirst error was: {}"
        )
        msg = msg.format(os.path.basename(sbt_in), ", ".join(encs), enc, cnf, str(e))
        QMessageBox.warning(self, "Opening subtitle failed", msg,
                            defaultButton=QMessageBox.NoButton)
        return sbt_out
def ldSrtFile(path):
    """Load an .srt file; return the parsed subs, or False when it has no entries.

    BUGFIX: the original returned the undefined name `false` (a NameError at
    runtime); corrected to the Python constant False.
    """
    subs = pysrt.open(path)
    if len(subs) == 0:
        return False
    return subs
def main():
    """Parse CLI arguments and rewrite a DJI Phantom 3 .srt file.

    Each subtitle's telemetry tokens are filtered/transformed according to
    the enabled flags; returns 0 on success, -1 when no data flags given.
    """
    parser = argparse.ArgumentParser(
        # BUGFIX: "scipt" -> "script" in the user-facing help text.
        description='This script helps to customize '
                    'the .srt file of a DJI Phantom 3.')
    parser.add_argument('-i', '--input', help='input .srt file', required=True)
    parser.add_argument('-o', '--output', help='output .srt file', required=True)
    parser.add_argument('-hb', '--barometer', action='store_true',
                        help='add barometer height')
    parser.add_argument('-hu', '--ultrasonic', action='store_true',
                        help='add ultrasonic height')
    parser.add_argument('-da', '--date', action='store_true',
                        help='add date of flight')
    parser.add_argument('-ti', '--time', action='store_true',
                        help='add time of flight')
    parser.add_argument('-du', '--duration', action='store_true',
                        help='add duration of flight')
    parser.add_argument('-sp', '--speed', action='store_true', help='add speed')
    parser.add_argument('-l', '--label', action='store_true',
                        help='add text label to each piece of data')
    args = parser.parse_args()
    if not (args.barometer or args.ultrasonic or args.date or args.time
            or args.duration or args.speed):
        print('Please specify some data to add!\n')
        parser.print_help()
        return -1
    subs = pysrt.open(args.input)
    for s in subs:
        t = ''
        # Tokens are separated by spaces or newlines in the original text.
        for token in re.split(' |\n', s.text):
            if args.barometer:
                t += filter_height(token=token, ultrasonic=False, use_label=args.label)
            if args.ultrasonic:
                t += filter_height(token=token, ultrasonic=True, use_label=args.label)
            if args.date:
                t += filter_date(token=token, use_label=args.label)
            if args.time:
                t += filter_time(token=token, use_label=args.label)
            if args.duration:
                t += compute_duration(token=token, use_label=args.label)
            if args.speed:
                t += compute_speed(token=token, use_label=args.label)
        s.text = t
    subs.save(args.output, encoding='utf-8')
    return 0
def parseSrt(file, overlap, output):
    """Flatten an .srt file into full text plus per-word timestamps.

    When `overlap` is set, cues are shifted -3s and only the last word of each
    later entry is appended (overlapping rolling captions); otherwise every
    word gets an evenly interpolated timestamp within its cue.
    Writes "<text>\\n\\n<timestamps>" to `output` when given.
    BUGFIX: xrange (Py2-only) replaced with range; output file now closed via `with`.
    """
    subs = pysrt.open(file)
    entries = len(subs)
    fullText = subs[0].text.replace('\n', ' ')
    timeArray = ''
    if overlap:
        subs.shift(seconds=-3)
    time = getTimeInMilliseconds(subs[0].start)
    end = getTimeInMilliseconds(subs[0].end)
    difference = end - time
    text = subs[0].text.replace('\n', ' ')
    textArray = text.split(' ')
    # First entry: interpolate a timestamp per word.
    if overlap:
        for x in range(0, len(textArray)):
            time += (difference / len(textArray))
            timeArray += str(time) + ' '
        print(timeArray)
    else:
        for x in range(0, len(textArray)):
            time += (difference / len(textArray))
            timeArray += str(time) + ' '
    # All remaining subtitle entries.
    for x in range(1, entries):
        sub = subs[x]
        time = getTimeInMilliseconds(sub.start)
        end = getTimeInMilliseconds(sub.end)
        difference = end - time
        text = sub.text.replace('\n', ' ')
        textArray = text.split(' ')
        if overlap:
            # Rolling captions: only the last word is new.
            fullText += ' ' + textArray[len(textArray) - 1]
            timeArray += str(end) + ' '
        else:
            fullText += ' ' + text
            for x in range(0, len(textArray)):
                time += (difference / len(textArray))
                timeArray += str(time) + ' '
    # Remove trailing whitespace.
    timeArray = timeArray[:-1]
    print(len(fullText.split(' ')))
    print(len(timeArray.split(' ')))
    if output:
        with open(output, "w") as text_file:
            text_file.write(fullText.encode('UTF-8'))
            text_file.write('\n\n')
            text_file.write(timeArray)
def get_subtitles(srt_filename):
    """Load an .srt file into a DataFrame of (start, end, text) rows.

    Start/end are seconds within the hour (minutes*60 + seconds + ms/1000).
    """
    frame = pd.DataFrame(columns=['start', 'end', 'text'])
    cues = pysrt.open(srt_filename)
    cues.shift(seconds=0)  # no-op shift kept for parity with the original
    for row_idx, cue in enumerate(cues):
        begin = int(cue.start.minutes) * 60 + int(cue.start.seconds) + cue.start.milliseconds / 1000
        finish = int(cue.end.minutes) * 60 + int(cue.end.seconds) + cue.end.milliseconds / 1000
        frame.loc[row_idx] = [begin, finish, cue.text]
    return frame
def loadSubtitles(self):
    """Load the current file's .srt, flatten multi-line cues, re-save it,
    and show the whole text in the reader pane."""
    if ".srt" in self.reader.currentFile.loadedExtensions:
        srt_path = self.reader.currentFile.name + ".srt"
        self.srt = pysrt.open(srt_path)
        # Collapse newlines inside each cue into single spaces.
        for cue in self.srt:
            cue.text = " ".join(cue.text.split("\n"))
        self.srt.save(srt_path, encoding="utf-8")
        whole_text = "\n".join(cue.text for cue in self.srt)
        #if self.reader.textContent.toPlainText() == "":
        self.reader.textContent.setPlainText(whole_text)
def getSubtitles(srtFile):
    """Load an .srt file into a DataFrame of (start, end, text) rows,
    shifting all cues 5 seconds later first.

    Start/end are whole seconds within the hour (minutes*60 + seconds).
    """
    table = pd.DataFrame(columns=['start', 'end', 'text'])
    cues = pysrt.open(srtFile)
    cues.shift(seconds=5)
    for row_idx, cue in enumerate(cues):
        begin = int(cue.start.minutes) * 60 + int(cue.start.seconds)
        finish = int(cue.end.minutes) * 60 + int(cue.end.seconds)
        table.loc[row_idx] = [begin, finish, cue.text]
    return table
def test_sentiment_api():
    """Smoke-test the text-processing.com sentiment endpoint.

    Builds the episode text (currently unused by the request — the posted
    text is a fixed sample) and prints the negative-sentiment probability.
    BUGFIX: Py2 print statement converted; dead local removed.
    """
    subs = pysrt.open("subtitles/" + 'S01E01' + ".srt")
    total_text = ""
    for sub in subs:
        # Strip newlines, italics markup and stray punctuation.
        total_text += sub.text.replace('\n', ' ').replace('<i>', '').replace('</i>', '').replace("\'", '').replace("?", ' ')
    form = dict(text="love love love")
    r = requests.post("http://text-processing.com/api/sentiment/", data=form)
    print(json.loads(r.text.encode("utf-8"))['probability']['neg'])
def test_eol_conversion(self):
    """Saving with eol='\\n' should rewrite Windows CRLF endings as LF.

    BUGFIX: file handles were never closed; now managed with `with`.
    """
    with open(self.windows_path, "rU", encoding="windows-1252") as input_file:
        input_file.read()
        self.assertEqual(input_file.newlines, "\r\n")
    srt_file = pysrt.open(self.windows_path, encoding="windows-1252")
    srt_file.save(self.temp_path, eol="\n")
    with open(self.temp_path, "rU", encoding="windows-1252") as output_file:
        output_file.read()
        self.assertEqual(output_file.newlines, "\n")
def test_eol_conversion(self):
    """Saving with eol='\\n' should rewrite Windows CRLF endings as LF.

    BUGFIX: deprecated assertEquals replaced with assertEqual; file handles
    were never closed, now managed with `with`.
    """
    with open(self.windows_path, 'rU') as input_file:
        input_file.read()
        self.assertEqual(input_file.newlines, '\r\n')
    srt_file = pysrt.open(self.windows_path, encoding='windows-1252')
    srt_file.save(self.temp_path, eol='\n')
    with open(self.temp_path, 'rU') as output_file:
        output_file.read()
        self.assertEqual(output_file.newlines, '\n')
def main(argv):
    """Transcode a movie into 2 GB-capped mkv parts with burned-in subtitles.

    Builds an ffmpeg command line piecewise, then loops producing sequential
    parts; subtitles (if a sibling .srt exists) are shifted per part so they
    stay in sync.  NOTE(review): argv is unused — argparse reads sys.argv.
    """
    parser = argparse.ArgumentParser(description='FFmpegConveter')
    parser.add_argument('-i', dest='input', required=True, help='input file', metavar='FILE')
    parser.add_argument('-s', dest='subtitlesSize', type=int, default=23)
    args = parser.parse_args()
    path = args.input
    subtitlesSize = args.subtitlesSize
    print 'Processing:\t', os.path.basename(path)
    # Base ffmpeg arguments, shared by every part.
    commands = []
    commands.append('ffmpeg')
    commands.append('-i "' + path + '"')
    commands.append('-vcodec h264')
    commands.append('-vprofile high')
    commands.append('-preset superfast')
    commands.append('-threads 0')
    commands.append('-acodec ac3')
    commands.append('-map 0:v:0')
    commands.append('-map 0:a')
    commands.append('-fs 2100000000')  # hard cap each part at ~2.1 GB
    length = getLength(path)
    size = os.path.getsize(path)
    pathSRT = changeExtension(path, 'srt')
    if not os.path.isfile(pathSRT):
        # No subtitles: let the user bail out.
        if not helper.query_yes_no('Subtitles not found, continue?'):
            quit()
    # Pick a CRF from the source bitrate (kbit-ish heuristic).
    crf = getCRF((size / length) / 1000)
    if crf > 0:
        commands.append('-crf ' + str(crf))
    part = 1
    lengthParts = 0  # seconds of output already produced
    # Each part overlaps its predecessor slightly (the -2 below), hence 2*part.
    while lengthParts + (2 * part) < length:
        commandsT = list(commands)
        outputPath = createOutputPath(path, '_' + str(part))
        outputPath = changeExtension(outputPath, 'mkv')
        if os.path.isfile(pathSRT):
            # Shift the subtitles back by the already-emitted duration and
            # burn them in via the subtitles video filter.
            with tempfile.NamedTemporaryFile(dir='tmp', suffix='.srt', delete=False) as tmpfile:
                subs = pysrt.open(pathSRT)
                subs.shift(seconds=-lengthParts)
                subs.save(tmpfile.name)
                commandsT.append('-vf "subtitles=\'' + os.path.relpath(tmpfile.name).replace('\\', '\\\\') + '\':force_style=\'Fontsize=' + str(subtitlesSize) + '\'"')
        commandsT.insert(1, '-ss ' + str(lengthParts))  # seek before -i's options take effect
        commandsT.append('"' + outputPath + '"')
        executeCommands(commandsT)
        # Advance by what was actually produced, minus a 2s overlap.
        lengthParts = lengthParts + getLength(outputPath) - 2
        part = part + 1
def numbering(myFile, keep, verbose):
    """Detect and repair out-of-sequence entry numbers in an .srt file.

    Parameters: myFile - path to the .srt file (read in utf-8, written with the
    module-level prefEncoding); keep - when True, the backup
    <myFile>.wrongNumbering is retained; verbose - print progress messages.
    Returns True when renumbering was necessary.
    """
    wrongNumbering = False
    if verbose:
        print "--- Checking numbering"
    subs = pysrt.open(myFile, encoding='utf-8')  # open sub with pysrt as utf-8
    entries = len(subs)  # count entries
    for entryNo in range(0, entries):  # count entry numbers up to number of entries
        subEntry = "%s" % subs[entryNo]  # render single entry as text
        lines = subEntry.split('\n')  # split entry into lines; lines[0] is the index
        if entryNo + 1 != int(lines[0]):  # entry number does not match real numbering
            wrongNumbering = True
            print "*** Correcting numbering"
            # Back up the original before rewriting it below.
            copyfile(myFile, "%s.wrongNumbering" % myFile)
            break
    if wrongNumbering:
        # Rewrite the file from the backup, forcing sequential indexes.
        targetFile = codecs.open(myFile, "w", prefEncoding)
        subs = pysrt.open("%s.wrongNumbering" % myFile, encoding='utf-8')  # open backup with pysrt
        entries = len(subs)  # count entries
        for entryNo in range(0, entries):  # count entry numbers up to number of entries
            subEntry = "%s" % subs[entryNo]  # read single entry
            lines = subEntry.split('\n')  # split entry into lines
            noLines = len(lines)  # number of lines in each entry
            for line in range(0, noLines):
                if line == 0:
                    targetFile.write("%s\n" % str(entryNo + 1))  # corrected index line
                else:
                    targetFile.write("%s\n" % lines[line])  # copy remaining lines verbatim
        targetFile.close()
        if not keep:
            if verbose:
                print "--- Deleting %s.wrongNumbering" % myFile
            os.remove("%s.wrongNumbering" % myFile)
    return wrongNumbering
def find_gif(subsfile, query):
    """Find the first subtitle containing `query`.

    Returns (start, duration): start as an "HH:MM:SS" string and duration in
    whole seconds, or (0, 0) when no cue matches.
    """
    cues = pysrt.open(subsfile)
    start = 0
    duration = 0
    print(query)
    for cue in cues:
        # Normalize whitespace and case before matching.
        normalized = " ".join(cue.text.lower().split())
        if query in normalized:
            begin = datetime.strptime(str(cue.start), "%H:%M:%S,%f")
            finish = datetime.strptime(str(cue.end), "%H:%M:%S,%f")
            start = begin.strftime("%H:%M:%S")
            duration = (finish - begin).seconds
            break
    return start, duration
def read(self, path, uri=None, **kwargs): """Load .srt file as transcription Parameters ---------- path : str Path to .srt file Returns ------- subtitle : Transcription """ # load .srt file using pysrt subtitles = pysrt.open(path) # initial empty transcription transcription = Transcription(uri=uri) # keep track of end of previous subtitle prev_end = TStart # loop on each subtitle in chronological order for subtitle in subtitles: # convert start/end time into seconds start = self._timeInSeconds(subtitle.start) end = self._timeInSeconds(subtitle.end) # connect current subtitle with previous one # if there is a gap between them if start > prev_end: transcription.add_edge(prev_end, start) # raise an error in case current subtitle starts # before previous subtitle ends elif start < prev_end: raise ValueError('Non-chronological subtitles') # split subtitle in multiple speaker lines (only if needed) lines = self._split(subtitle.text) # loop on subtitle lines for line, start_t, end_t in self._duration(lines, start, end): transcription.add_edge(start_t, end_t, subtitle=line) prev_end = end transcription.add_edge(end, TEnd) self._loaded = {(uri, 'subtitle'): transcription} return self
def render(self, input_video_fname, input_subt_fname, output_video_fname):
    """Render the input video with subtitles burned in, frame by frame.

    BUGFIX: Py2-only print statements (including the trailing-comma form)
    replaced with portable print()/sys.stdout.write equivalents.
    """
    capture, writer = self._get_video_components(input_video_fname, output_video_fname)
    subts = pysrt.open(input_subt_fname)
    print("total %s frames" % self._frame_count)
    for idx in range(self._frame_count):
        ret, frame = capture.read()
        if not ret:  # stream ended early
            break
        # In-place progress indicator (carriage return, no newline).
        sys.stdout.write("\rframe %d" % idx)
        sys.stdout.flush()
        frame = self._put_subt(idx, frame, subts)
        writer.write(frame)
def _read_dvs(self, dvs_filename):
    """Read a DVS file and return its cleaned text lines."""
    entries = pysrt.open(dvs_filename, encoding='iso-8859-1')
    lines = [line.strip() for line in entries.text.split('\n')]
    lines = [quote_matches.sub('', line).strip() for line in lines]
    # Cleanup DVS: strip the DVS index and anything in curly braces.
    cleaned = []
    for line in lines:
        line = dvs_rep.sub('', line).strip()
        cleaned.append(dvs_cur.sub('', line).strip())
    return cleaned
import nltk import json import pysrt from nltk.tag.stanford import POSTagger english_postagger = POSTagger('models/english-bidirectional-distsim.tagger', 'stanford-postagger.jar') from nltk.stem import WordNetLemmatizer wnl = WordNetLemmatizer() from nltk.tokenize import RegexpTokenizer toker = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True) from nltk.corpus import stopwords stop = stopwords.words('english') subs = pysrt.open('deneme.srt') ignore = -1 j=0 myObj=[] tok=[] f = open('kelimeler', 'wr') for (i,sub) in enumerate(subs): if ignore == i: ignore = -1 continue text = sub.text.strip() if not text[-1] in ".!?": text += " "+subs[i+1].text ignore = i+1 #print english_postagger.tag(text.split())[0][1] for j in english_postagger.tag(text.split()): #print i #print j[0] if ((j[0] not in stop) and (j[1] == "NN" or j[1] == "JJ" or j[1] == "VB")): start=str(sub.start)
# mod-et-emo-004: write sentences + emotional intensity + polarity to CSV.
# Remove any stale output first so the CSV is always fresh.
if os.path.exists(os.path.join(current_path, "mod-et-emo-004.csv")):
    os.remove(os.path.join(current_path, "mod-et-emo-004.csv"))
# Only write when the two column lists line up.
if len(sentences) == len(emotional_intensity):
    df = pd.DataFrame(
        data={
            'Sentences': sentences,
            'Emotional Intensity': emotional_intensity,
            'Polarity': polarity
        })
    df.to_csv('mod-et-emo-004.csv', sep='\t', index=False)

# ## mod-et-emo-005 ==> Association of the temporary emotional intensity in a video

# In[86]:

subs = pysrt.open(input_subtitle_path)
# Total video duration in milliseconds, taken from the last cue's end time.
x = subs[len(subs) - 1]
[hour, minute, sec] = [x.end.hours, x.end.minutes, x.end.seconds]
total_duration = hour * 3600000 + minute * 60000 + sec * 1000
# Concatenate all subtitle text, then count cleaned words.
text = ""
for i in range(len(subs)):
    a = subs[i]
    text = text + " " + a.text
words = clean_words(text)
num_words = len(words)
print("Total words: " + str(num_words))
words_per_duration = num_words / total_duration
# Length of each analysis window; num_period comes from elsewhere in the notebook.
time_period = int(total_duration / num_period)
print("Time period in milliseconds: " + str(time_period))
def main():
    """Demo: print a movie's subtitles, then play the video with optional
    face detection/cropping.  Paths are hard-coded; ESC quits playback."""
    # Load the subtitles
    subs = pysrt.open('/home/juan/Videos/AMOR.es.srt', encoding='iso-8859-1')
    # Start the video capture
    cap = cv2.VideoCapture('/home/juan/Videos/AMOR.mp4')
    cv2.namedWindow('Original')
    cv2.namedWindow('Cropped')
    # Extract number of subtitles
    num_subs = len(subs)
    print('Num subtitles:', num_subs)
    # Extract video metadata
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
    fps = cap.get(cv2.CAP_PROP_FPS)
    print('video resolution:', width, ' x ', height)
    print('video framerate:', fps)
    cv2.waitKey(0)  # wait for a key before starting
    # Print every subtitle with its start/end minute:second.
    for s_idx, sub in enumerate(subs):
        s = "{0:1d}, {1:02d}:{2:02d} to {3:02d}:{4:02d} {5:s}"
        text = cleanhtml(sub.text)
        print(
            s.format(s_idx, sub.start.minutes, sub.start.seconds,
                     sub.end.minutes, sub.end.seconds, text))
    # NOTE(review): frame loop placed after the subtitle loop — confirm the
    # original nesting against the source notebook/script.
    while True:
        # capture next frame
        ret, frame = cap.read()
        if PROCESS_VIDEO:
            # resize frame for faster processing
            frame = cv2.resize(frame, (0, 0), fx=0.5, fy=0.5,
                               interpolation=cv2.INTER_LINEAR)
            # detect faces and landmarks
            bounding_boxes, landmarks = detect_faces(frame)
            # if only one face detected
            if bounding_boxes.shape[0] == 1:
                # extract the bounding box
                bb = bounding_boxes[0]
                x1, y1, x2, y2 = int(bb[0]), int(bb[1]), int(bb[2]), int(bb[3])
                # crop the face
                cropped = frame[y1:y2, x1:x2]
                cv2.imshow('Cropped', cropped)
            # draw the bounding box and landmarks on original frame
            frame = show_bboxes(frame, bounding_boxes, landmarks)
        # Display the image
        cv2.imshow('Original', frame)
        # Read keyboard and exit if ESC was pressed
        k = cv2.waitKey(10) & 0xFF
        if k == 27:
            break
    # Release resources
    cap.release()
    cv2.destroyAllWindows()
import time
import pysrt

# Walk test.srt in five consecutive slices, printing the accumulated text
# after each slice (output is intentionally cumulative), then print the
# whole file's text via pysrt.
accumulated = []
subs = pysrt.open("test.srt")
total = len(subs)
for part in range(5):
    lo = (part * total) // 5
    hi = ((part + 1) * total) // 5
    for idx in range(lo, hi):
        accumulated.append(subs[idx].text + "\n\n")
    print("".join(accumulated))
print(subs.text)
def extract_subtitles(filename, text_only=True):
    """Parse an .srt file.

    Returns just the text strings when text_only is True (the default),
    otherwise the full pysrt subtitle objects.
    """
    parsed = pysrt.open(filename)
    if not text_only:
        return parsed
    return [entry.text for entry in parsed]
def lin_shift(srt):
    """Shift every cue in `srt` forward by 2m26.452s and save as '<name>m<ext>'."""
    base, extension = os.path.splitext(srt)
    cues = pysrt.open(srt)
    # subs.shift(seconds=t3)
    offset_seconds = 2 * 60 + 26.452
    cues.shift(seconds=offset_seconds)
    cues.save(base + 'm' + extension)
import pysrt import sys import os import shutil reload(sys) sys.setdefaultencoding('utf-8') #Main Function if __name__ == '__main__': subs = pysrt.open(str(sys.argv[1]), encoding='iso-8859-1') #input srt file src = str(sys.argv[2]) #input source location frameRate = 30 #frame rate of the video #get the youtube link fileName = str(sys.argv[1]).split('-') fileName = fileName[1].split('.') fileName = fileName[0] for sub in subs: #get start and end times of subtitles startTime = int(sub.start.hours * 3600 + sub.start.minutes * 60 + sub.start.seconds) endTime = int(sub.end.hours * 3600 + sub.end.minutes * 60 + sub.end.seconds) #determine start and end frames numbers if startTime == 1:
import pysrt import math #subs1 = pysrt.open('cze.srt', encoding='iso-8859-1') #subs2 = pysrt.open('eng.srt', encoding='iso-8859-1') subs1 = pysrt.open('eng.srt', encoding='iso-8859-1') subs2 = pysrt.open('cze.srt', encoding='iso-8859-1') OFFSET = 1 for s1 in subs1: s1StartTime = s1.start.seconds + s1.start.minutes * 60 + s1.start.hours * 60 * 60 s1EndTime = s1.end.seconds + s1.end.minutes * 60 + s1.end.hours * 60 * 60 print s1.text.encode('iso-8859-1'), end = False for s2 in subs2: if end: break s2StartTime = s2.start.seconds + s2.start.minutes * 60 + s2.start.hours * 60 * 60 s2EndTime = s2.end.seconds + s2.end.minutes * 60 + s2.end.hours * 60 * 60 # Zacinaji pobliz startu s1 nejake tiutlky s2? #print math.fabs(s1StartTime - s2StartTime) if math.fabs(s1StartTime - s2StartTime) <= OFFSET: # Pokud ano..vypisuje: print '\t------\t', s2.text.encode('iso-8859-1') for foo in subs2:
def new_audio_trim(chunk):
    """Re-trim one audio chunk after its subtitle was edited.

    Rewrites the chunk's entry in the concat list file, updates the
    chunk's line in the shared .srt, re-encodes webm input to mp3,
    pads/cuts the audio to match the cue's start/end duration, and
    finally re-runs compile_all_chunks.

    NOTE(review): indentation below is reconstructed from a collapsed
    source; the nesting of the webm branch vs. the temp-file pipeline
    should be confirmed against the original file.
    """
    chunk_file = chunk['audio_chunk'].split('/')[-1]
    old_chunk_file = chunk['history'][1]['audio_chunk'].split('/')[-1]
    old_subtitle = chunk['history'][1]['subtitle']
    new_subtitle = chunk['subtitle']
    ext = '.' + chunk_file.split('.')[-1]
    old_ext = '.' + old_chunk_file.split('.')[-1]
    old_chunk_file_name = old_chunk_file.split('.')[0]
    chunk_file_name = chunk_file.split('.')[0]
    print('old', old_chunk_file)
    print('new', chunk_file)
    video_id = chunk['VideoTutorial']
    folder_path = os.path.join(settings.MEDIA_ROOT,
                               settings.VIDEO_PROCESSING_ROOT, video_id)
    chunk_directory = os.path.join(folder_path, CHUNKS_DIRECTORY)
    # NOTE(review): os.chdir changes process-wide CWD — all relative
    # paths below depend on it.
    os.chdir(chunk_directory)
    fp = open(CHUNKS_LIST_FILE_NAME, 'r')
    chunk_list = str(fp.read())
    # Swap the old chunk's entry in the ffmpeg concat list for the new
    # one; webm sources are referenced by their converted mp3 name.
    if old_ext == '.webm':
        chunk_list = chunk_list.replace(
            'file ' + "'" + old_chunk_file_name + AUDIO_FILE_EXTENSION + "'",
            'file ' + "'" + chunk_file + "'")
    if old_ext == '.webm' and ext == '.webm':
        chunk_list = chunk_list.replace(
            'file ' + "'" + old_chunk_file_name + AUDIO_FILE_EXTENSION + "'",
            'file ' + "'" + chunk_file_name + AUDIO_FILE_EXTENSION + "'")
    if ext == '.webm':
        chunk_list = chunk_list.replace(
            'file ' + "'" + old_chunk_file + "'",
            'file ' + "'" + chunk_file_name + AUDIO_FILE_EXTENSION + "'")
    else:
        chunk_list = chunk_list.replace('file ' + "'" + old_chunk_file + "'",
                                        'file ' + "'" + chunk_file + "'")
    print(chunk_list)
    fp.close()
    fp = open(CHUNKS_LIST_FILE_NAME, 'w')
    fp.write(chunk_list)
    fp.close()
    # modify subtitle file: overwrite this chunk's cue text in place
    subs = pysrt.open('../' + SUBTITLE_FILE_NAME + SUBTITLE_FILE_EXTENSION,
                      encoding='utf-8')
    old_text = subs[chunk['chunk_no']]
    old_text.text = new_subtitle
    subs.save('../' + SUBTITLE_FILE_NAME + SUBTITLE_FILE_EXTENSION,
              encoding='utf-8')
    start_time = chunk['start_time']
    end_time = chunk['end_time']
    time_format = '%H:%M:%S'
    VideoTutorial.objects.filter(pk=video_id).update(status='in_queue')
    # Target duration of this chunk, from the cue's start/end times.
    diff = datetime.strptime(end_time, time_format) - datetime.strptime(
        start_time, time_format)
    if (ext == '.webm'):
        # webm support: convert to mp3 first, then continue with the
        # converted file.
        print('webm')
        os.system('ffmpeg -i ' + chunk_file + ' -vn -c:a libmp3lame -ar ' +
                  AUDIO_SAMPLE_RATE + ' -ab ' + AUDIO_BIT_RATE + ' ' +
                  chunk_file_name + AUDIO_FILE_EXTENSION)
        chunk_file = chunk_file_name + AUDIO_FILE_EXTENSION
        # obj = VideoChunk.objects.get(VideoTutorial=video_id, chunk_no=chunk['chunk_no'])
        # obj.audio_chunk = os.path.join(settings.VIDEO_PROCESSING_ROOT, video_id, CHUNKS_DIRECTORY,
        #                                chunk_file_name + AUDIO_FILE_EXTENSION)
        # obj.save()
    # Normalize the chunk into temp1 (strip metadata, fixed rate/bitrate).
    os.rename(chunk_file, 'temp' + AUDIO_FILE_EXTENSION)
    os.system('ffmpeg -i temp' + AUDIO_FILE_EXTENSION + ' -ab ' +
              AUDIO_BIT_RATE + ' -ar ' + AUDIO_SAMPLE_RATE +
              ' -c copy -map_metadata -1 temp1' + AUDIO_FILE_EXTENSION)
    # getting the length of audio via ffprobe (H:M:S.micro string)
    audio_length_format = '%H:%M:%S.%f'
    audio_length_str = str(os.popen(
        "ffprobe -v error -sexagesimal -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 temp1"
        + AUDIO_FILE_EXTENSION).read())
    audio_length_str = audio_length_str.rstrip()
    audio_start_time = start_time + '.000000'
    audio_end_time = end_time + '.000000'
    audio_len = datetime.strptime(audio_length_str,
                                  audio_length_format) - datetime.strptime(
        "00:00:00.000000", audio_length_format)
    audio_diff = datetime.strptime(audio_end_time,
                                   audio_length_format) - datetime.strptime(
        audio_start_time, audio_length_format)
    if audio_len < audio_diff:
        print('less')
        # add some silence to pad the audio up to the cue duration,
        # then cut the padded file to exactly `diff`
        abs_diff = str(abs(audio_diff - audio_len))
        os.system("ffmpeg -y -f lavfi -i anullsrc=sample_rate=" +
                  AUDIO_SAMPLE_RATE + " -ab " + AUDIO_BIT_RATE + " -t " +
                  abs_diff + " silence" + AUDIO_FILE_EXTENSION)
        os.system('ffmpeg -y -i "concat:temp1' + AUDIO_FILE_EXTENSION +
                  '|silence' + AUDIO_FILE_EXTENSION +
                  '" -acodec copy temp2' + AUDIO_FILE_EXTENSION)
        command = str("ffmpeg -y -i temp2" + AUDIO_FILE_EXTENSION +
                      " -ss 00:00:00.000 " + " -to " + str(diff) +
                      " -c copy " + chunk_file)
    else:
        print('more')
        # audio is long enough: just cut it down to the cue duration
        command = str("ffmpeg -y -i temp1" + AUDIO_FILE_EXTENSION +
                      " -ss 00:00:00.000 " + " -to " + str(diff) +
                      " -c copy " + chunk_file)
    os.system(command)
    # Clean up the intermediate files.
    os.remove('temp' + AUDIO_FILE_EXTENSION)
    os.remove('temp1' + AUDIO_FILE_EXTENSION)
    if os.path.exists('temp2' + AUDIO_FILE_EXTENSION):
        os.remove('temp2' + AUDIO_FILE_EXTENSION)
    compile_all_chunks(video_id)
def process_video(video_id):
    """ this function will break the uploaded video into chunks and will store audio separately

    Per subtitle cue: one audio chunk named <i>.mp3, plus optional
    "hole" chunks h_<i>.mp3 for the silent gaps before/between/after
    cues. Every produced chunk is appended to a concat list file, and a
    VideoChunk row is created per cue. Finishes by re-assembling with
    compile_all_chunks.
    """
    folder_path = os.path.join(settings.MEDIA_ROOT,
                               settings.VIDEO_PROCESSING_ROOT, video_id)
    # NOTE(review): process-wide chdir — relative paths below depend on it.
    os.chdir(folder_path)
    # convert to mp4
    os.system('ffmpeg -y -i ' + VIDEO_FILE_NAME + INCOMING_VIDEO_EXTENSION +
              ' -max_muxing_queue_size 1024 -c:v libx264 -c:a libmp3lame ' +
              ' -ab ' + AUDIO_BIT_RATE + ' -ar ' + AUDIO_SAMPLE_RATE + ' ' +
              VIDEO_FILE_NAME + VIDEO_FILE_EXTENSION)
    # extract video
    os.system('ffmpeg -y -i ' + VIDEO_FILE_NAME + VIDEO_FILE_EXTENSION +
              ' -c copy -an ' + VIDEO_WITHOUT_AUDIO_FILE_NAME +
              VIDEO_FILE_EXTENSION)
    # extract audio
    os.system('ffmpeg -y -i ' + VIDEO_FILE_NAME + VIDEO_FILE_EXTENSION +
              ' -ab ' + AUDIO_BIT_RATE + ' -ar ' + AUDIO_SAMPLE_RATE +
              ' -vn -c copy ' + AUDIO_FILE_NAME + AUDIO_FILE_EXTENSION)
    chunk_directory = os.path.join(folder_path, CHUNKS_DIRECTORY)
    os.mkdir(chunk_directory)
    # Normalize bare HH:MM:SS timestamps in the .srt via `repl`
    # (defined elsewhere in this module); the (?!,) lookahead skips
    # timestamps already followed by a millisecond comma.
    fp = open(SUBTITLE_FILE_NAME + SUBTITLE_FILE_EXTENSION, 'r')
    subtitle_text = str(fp.read())
    x = re.sub(r'[0-9][0-9]:[0-5][0-9]:[0-5][0-9](?!,)', repl, subtitle_text)
    fp.close()
    fp = open(SUBTITLE_FILE_NAME + SUBTITLE_FILE_EXTENSION, 'w')
    fp.write(x)
    fp.close()
    compile_video_list = open(CHUNKS_DIRECTORY + '/' + CHUNKS_LIST_FILE_NAME,
                              'w+')
    subs = pysrt.open(SUBTITLE_FILE_NAME + SUBTITLE_FILE_EXTENSION,
                      encoding='utf-8')
    VideoTutorial.objects.filter(pk=video_id).update(total_chunks=len(subs))
    for i in range(len(subs)):
        sub_text = str(subs[i].text)
        # pysrt renders times as HH:MM:SS,mmm; ffmpeg wants a dot.
        start_time = str(subs[i].start).replace(',', '.')
        end_time = str(subs[i].end).replace(',', '.')
        nos_audio_file_name = chunk_directory + "/" + 'h_' + str(
            i) + AUDIO_FILE_EXTENSION
        # for the first video without subtitle (leading gap before cue 0)
        if (i == 0) and (start_time != '00:00:00.000'):
            command = str("ffmpeg -i " + AUDIO_FILE_NAME +
                          AUDIO_FILE_EXTENSION + " -ss 00:00:00.000 " +
                          " -to " + start_time + " -c copy " +
                          nos_audio_file_name)
            os.system(command)
            compile_video_list.write("file '" + 'h_' + str(i) +
                                     AUDIO_FILE_EXTENSION + "'\n")
        # Gap between the previous cue's end and this cue's start.
        if i != 0:
            nos_start_time = str(subs[i - 1].end).replace(',', '.')
            nos_end_time = str(subs[i].start).replace(',', '.')
            if nos_start_time != nos_end_time:
                command = str("ffmpeg -i " + AUDIO_FILE_NAME +
                              AUDIO_FILE_EXTENSION + " -ss " +
                              nos_start_time + " -to " + nos_end_time +
                              " -c copy " + nos_audio_file_name)
                os.system(command)
                compile_video_list.write("file '" + 'h_' + str(i) +
                                         AUDIO_FILE_EXTENSION + "'\n")
        # The cue's own audio chunk.
        audio_file_name = chunk_directory + "/" + str(i) + AUDIO_FILE_EXTENSION
        command = str("ffmpeg -i " + AUDIO_FILE_NAME + AUDIO_FILE_EXTENSION +
                      " -ss " + start_time + " -to " + end_time +
                      " -c copy " + audio_file_name)
        os.system(command)
        compile_video_list.write("file '" + str(i) + AUDIO_FILE_EXTENSION +
                                 "'\n")
        VideoChunk.objects.create(
            chunk_no=i,
            VideoTutorial=VideoTutorial.objects.get(id=video_id),
            audio_chunk=os.path.join(settings.VIDEO_PROCESSING_ROOT, video_id,
                                     CHUNKS_DIRECTORY,
                                     str(i) + AUDIO_FILE_EXTENSION),
            start_time=start_time,
            end_time=end_time,
            subtitle=sub_text.encode())
        # Trailing audio after the last cue.
        if i == len(subs) - 1:
            nos_audio_file_name = chunk_directory + "/" + 'h_' + str(
                i + 1) + AUDIO_FILE_EXTENSION
            command = str("ffmpeg -i " + AUDIO_FILE_NAME +
                          AUDIO_FILE_EXTENSION + " -ss " + end_time +
                          " -c copy " + nos_audio_file_name)
            os.system(command)
            compile_video_list.write("file '" + 'h_' + str(i + 1) +
                                     AUDIO_FILE_EXTENSION + "'\n")
    compile_video_list.close()
    compile_all_chunks(video_id)
def get_movie_data(movies_list):
    """For each title in *movies_list*, scrape yifysubtitles.com for an
    English subtitle, download/extract it, dump its text to a .txt, and
    append a populated `movie` object to the module-level `movies` list.

    NOTE(review): indentation is reconstructed from a collapsed source.
    `year`/`rating` are only assigned inside the innermost branch and
    `subtitles` only on some paths, so later reads can raise
    UnboundLocalError/NameError on miss paths — confirm against the
    original. Attributes are set on the *class* `movie`, not on an
    instance.
    """
    # INIT OBJECT
    for i in movies_list:
        class movie(object):
            def __init__(self):
                self.title = ""
                self.year = ""
                self.rating = ""
                self.subtitles = ""

            def __repr__(self):
                return str(self)

        # h2 = i.find('div', attrs = {'class':'article_movie_title'})
        # title = h2.a.text.strip()
        # year = h2.find('span', attrs = {'class':"subtle start-year"}).text.strip()
        # actors = []
        # actors_div = i.find('div', attrs = {'class':'info cast'}).findAll('a')
        # for j in actors_div:
        #     actor = j.text
        #     actors.append(actor)
        # separator = ', '
        # actors = separator.join(actors)
        # director = i.find('div', attrs = {'class':"info director"}).a.text.strip()
        movie.title = i
        # movie.year = '1991' #year.replace('(','').replace(')','')
        # movie.rating = rating.replace('%','')
        # movie.actors = actors
        # movie.director = director

        ## SCRAPE AND LOAD SUBTITLES
        url = "http://www.yifysubtitles.com/search?q="
        movie_name = movie.title
        # Normalize the title for the search URL.
        movie_fix = movie_name.lower().replace("½", " 1/2").replace(
            "é", "e").replace(" ", "+")
        # connect to url (percent-encode only the path component)
        movie_url = url + movie_fix
        scheme, netloc, path, query, fragment = parse.urlsplit(movie_url)
        path = parse.quote(path)
        movie_url = parse.urlunsplit((scheme, netloc, path, query, fragment))
        print(movie_url)
        try:
            source = urllib.request.urlopen(movie_url).read()
            soup_movie = BeautifulSoup(source, "html.parser",
                                       from_encoding="utf-8")
        except urllib.error.HTTPError:
            # NOTE(review): `pass` after print does not skip the rest of
            # the iteration; soup_movie may be stale/undefined below.
            print('MISSING SEARCH', movie_name)
            pass
        # Searches through a table of movies
        if soup_movie.find("h3", string=movie.title):
            link = soup_movie.find(
                "h3", string=movie.title).find_parent("a").get("href")
            #print(link)
            parse_obj = urlparse(movie_url)
            url = parse_obj.scheme + "://" + parse_obj.netloc
            sub_url = url + link
            try:
                sub_source = urllib.request.urlopen(sub_url).read()
                soup_sub = BeautifulSoup(sub_source, "html.parser",
                                         from_encoding="utf-8")
            except urllib.error.HTTPError:
                print('MISSING PAGE', movie_name)
                pass
            # Searches through a list of subtitles
            if soup_sub.find("span", string="English"):
                yify = "https://www.yifysubtitles.com/"
                year = soup_sub.find('div', attrs={
                    'class': "circle",
                    'data-info': "year"
                }).get("data-text")
                rating = soup_sub.find('div', attrs={
                    'class': "circle",
                    'data-info': "Tomato"
                }).get("data-text")
                link_sub = soup_sub.find(
                    "span", string="English").find_parent("tr").find(
                    "a", {
                        "class": "subtitle-download"
                    }).get("href")
                link_sub = yify + link_sub
                #print(link_sub)
                sub_final = urllib.request.urlopen(link_sub).read()
                soup_final = BeautifulSoup(sub_final, "html.parser",
                                           from_encoding="utf-8")
                # Scrapes the subtitle url
                if soup_final.find("a", {"class": "btn-icon download-subtitle"}):
                    link_final = soup_final.find(
                        "a", {
                            "class": "btn-icon download-subtitle"
                        }).get("href")
                    print(link_final)
                    # Download the zip and extract any .srt into ./subs.
                    current_directory = os.getcwd()
                    zip_directory = os.path.join(current_directory, r'zip')
                    if not os.path.exists(zip_directory):
                        os.makedirs(zip_directory)
                    urllib.request.urlretrieve(
                        link_final,
                        '{}/{}.zip'.format(zip_directory, movie.title))
                    zip_file = '{}/{}.zip'.format(zip_directory, movie.title)
                    dest = os.path.join(current_directory, r'subs')
                    if not os.path.exists(dest):
                        os.makedirs(dest)
                    f = zipfile.ZipFile(zip_file)
                    for file in f.namelist():
                        if file.endswith('.srt'):
                            print('Extracting: ' + file)
                            try:
                                f.extract(file, path='subs')
                                subs = pysrt.open(os.path.join(dest, file),
                                                  encoding='iso-8859-1')
                                file_name = '{}/{}.txt'.format(
                                    dest, movie.title)
                                try:
                                    os.remove(file_name)
                                except:
                                    print("Error while deleting file ",
                                          file_name)
                                if subs:
                                    for sub in subs:
                                        # NOTE(review): `f` is rebound here,
                                        # shadowing the ZipFile handle used
                                        # by f.extract on later iterations.
                                        with open(file_name, 'a') as f:
                                            #f.write("1\n")
                                            #f.write("{0} --> {1}\n".format(start_new, end_new))
                                            f.write(sub.text)
                                    #movie.subtitles = sub_text
                                    with open(file_name) as fp:
                                        subtitles = fp.read()
                                else:
                                    print("Error while writing file ", file)
                                    subtitles = "no link"
                            except:
                                print("Error while extracting file ", file)
                                subtitles = "no link"
                        else:
                            subtitles = "no link"
                else:
                    subtitles = "no link"
            else:
                subtitles = "no link"
        else:
            subtitles = "no link"
        movie.subtitles = subtitles
        # Fall back to "N/A" when the scrape produced no value.
        if year is None:
            movie.year = "N/A"
        else:
            movie.year = year
        if rating is None:
            movie.rating = "N/A"
        else:
            movie.rating = rating
        print(movie.title, movie.year, movie.rating)
        movies.append(movie)
# Import everything needed to edit video clips from moviepy.editor import * import pysrt from moviepy.editor import concatenate_audioclips from tts_it import sub_to_audio from speed import speed_adjust # Load your movie # Give proper path to your movie videoclip = VideoFileClip("single_scene.mp4") audioclip = videoclip.audio #importing subtitles #provide full path to subtitle file subs = pysrt.open('sub_1.srt') # initializing some parameters last_e = '00:00:00.00' final_clip = audioclip.subclip(0, 0) def time_conv(_sub): ''' convert SubRipTime to usable format ''' last_e_h, last_e_m, last_e_s, last_e_ms = _sub last_e_ms = last_e_ms // 10 s = str(last_e_h) + ':' + str(last_e_m) + ':' + str(last_e_s) + '.' + str( last_e_ms) return s for sub in subs:
def process_video_with_srt(video_file):
    """Cut a video into one small clip per bilingual subtitle cue.

    Reads <video>_correct.srt, skips short/empty/early cues and cues
    without both a Chinese and an English line, then extracts each
    remaining cue's time span into <video>/<english text>.mp4 via
    ffmpeg.

    NOTE(review): `srt_file` opened by the `with` below is never read —
    the subtitles are re-opened through pysrt; confirm the handle is
    only kept for its implicit existence check.
    """
    file_name, file_extension = os.path.splitext(video_file)
    srt_file_name = file_name + '_correct.srt'
    with open(srt_file_name, 'r',
              encoding=get_file_encode(srt_file_name)) as srt_file:
        # Subtitle list format (example):
        # 0 = {str} # '11\n'
        # 1 = {str} # '00:00:21,770 --> 00:00:23,270\n'
        # 2 = {str} # '是我\n'
        # 3 = {str} # 'Hey, it’s me.\n'
        # Recreate the per-video output directory.
        if os.path.isdir(file_name):
            shutil.rmtree(file_name, ignore_errors=True)
        os.makedirs(file_name)
        video_info_cmd = 'ffprobe -v quiet -print_format json -show_format -show_streams "{}"'.format(
            video_file)
        video_duration = float(
            json.loads(run_command(video_info_cmd))['format']['duration'])
        encode = get_file_encode(srt_file_name)
        subtitle_list = pysrt.open(srt_file_name, encoding=encode)
        # Total span covered by the subtitles, in seconds.
        srt_info_duration = (subtitle_list[-1].end -
                             subtitle_list[0].start).to_time()
        srt_duration = srt_info_duration.hour * 3600 + srt_info_duration.minute * 60 + srt_info_duration.second
        # Bail out when the video is far longer than the subtitles —
        # they probably belong to different files.
        # NOTE(review): video_duration is seconds but the margin is
        # 5000 — looks like it was meant to be 5 (or ms); confirm.
        if (video_duration > srt_duration + 5000):
            print('视频长度远大于字幕最大长度,可能不匹配')
            return
        for subtitle in subtitle_list:
            # Skip cues shorter than 3s, cues with no text, and cues in
            # the first minute of the video.
            if subtitle.duration.seconds < 3 or len(
                    subtitle.text_without_tags
            ) == 0 or subtitle.start.minutes < 1:
                continue
            start_time = '{}'.format(subtitle.start).split(',')[0]
            # Add a 1-second buffer at the end (may not be enough).
            subtitle.end.seconds += 1
            end_time = '{}'.format(subtitle.end).split(',')[0]
            # Use presence of Chinese characters to separate the
            # translation line(s) from the original-language line(s).
            subtitle_lines = subtitle.text_without_tags.split('\n')
            subtitle_text_chn = ''
            subtitle_text_eng = ''
            for line in subtitle_lines:
                if check_contain_chinese(line):
                    # Normalize traditional Chinese to simplified.
                    if mafan_text.is_traditional(line):
                        subtitle_text_chn += mafan_text.simplify(line.strip())
                    else:
                        subtitle_text_chn += line.strip()
                else:
                    subtitle_text_eng += line.strip()
            # Skip cues missing either the translation or the original.
            if len(subtitle_text_eng) < 4 or len(subtitle_text_chn) == 0:
                continue
            # if subtitle_text_eng.startswith('Then I got really freaked out, and that'):
            #     print(subtitle_text_eng)
            # else:
            #     continue
            # The English text becomes the file name, so strip
            # characters that are invalid in file names.
            subtitle_text_eng = validate_file_name(subtitle_text_eng)
            if os.name == 'nt':
                subtext = file_name + '\\' + subtitle_text_eng + '.mp4'
            else:
                subtext = file_name + '/' + subtitle_text_eng + '.mp4'
            # With duplicate names the last cue wins.
            if os.path.isfile(subtext):
                os.remove(subtext)
            #https://stackoverflow.com/questions/20847674/ffmpeg-libx264-height-not-divisible-by-2
            cmd = 'ffmpeg -i "{}" -ss {} -to {} -vf scale=560:-2 -c:v libx264 -c:a aac -crf 30 -ac 1 -preset veryslow "{}"'.format(
                video_file, start_time, end_time, subtext)
            print(cmd)
            rst = run_command(cmd)
def empty(update: Update, context: CallbackContext):
    """Blank out the chat's current subtitle line and advance to the next."""
    srt_path = context.chat_data['file']
    subs = pysrt.open(srt_path, encoding='UTF-8')
    subs[context.chat_data['line']].text = ''
    subs.save(srt_path, encoding='UTF-8')
    send.next_line(update, context)
## there is a file located below: ## https://storage.googleapis.com/qst-datasets/subtitles/Shrek-2001.srt ## there is also some started code below ## calculate the sentiment over the course of the movie script (Shrek) ## plot the sentiment arc over the movie ## # 0. get the file - the file in the browser will auto download # just make sure you have the file in your working directory # or if on colab ! wget https://storage.googleapis.com/qst-datasets/subtitles/Shrek-2001.srt # 1. get the file and parse import pysrt subs = pysrt.open('Shrek-2001.srt', encoding='iso-8859-1') ##################################### Quick example ##################################### Key Words in Context - Concordance ### ### powerful tool to look at a set of text (full corpus) and look for ### words before/after ### ### helpful for eda, look for patterns to help support data annotation, etc. # just in case # nltk.download('punkt') # get the data # SQL = "SELECT * FROM `questrom.datasets.topics`"
import pysrt
from pydub import AudioSegment

# Slice an audio file into one .wav per subtitle cue and write a
# "path|text" index line per clip into a csv-like file.
audio_name = 'vachtuongcu.mp3'
sub_name = 'vachtuongcu.srt'
audio_outdir = 'output'
csv_output = 'output.csv'

song = AudioSegment.from_file(audio_name)
subs = pysrt.open(sub_name, encoding='utf-8')

# Define lambda function convert time to miliseconds
time_to_ms = lambda x: (x.hours * 3600 + x.minutes * 60 + x.seconds
                        ) * 1000 + x.milliseconds

# Extract data
with open(csv_output, 'w', encoding='utf-8') as fd:
    for sub in subs:
        # Get start time, end time in miliseconds
        start_ms = time_to_ms(sub.start)
        end_ms = time_to_ms(sub.end)
        # Audio extracted file name
        audio_extract_name = '{}/{}_{}_{}.wav'.format(audio_outdir,
                                                      audio_name, start_ms,
                                                      end_ms)
        # BUG FIX: str(sub.text.encode('utf-8')) produced the Python 3
        # bytes repr ("b'...'") in the index file; write the text itself
        # and let the utf-8 file handle encode it.
        text = sub.text
        # Extract file
        extract = song[start_ms:end_ms]
        # Saving
        extract.export(audio_extract_name, format="wav")
        # Write to csv file
        fd.write('{}|{}\n'.format(audio_extract_name, text))
# NOTE(review): fragment — this block starts inside a command-line
# argument loop whose `if`/`for` header lies above the visible region.
        srt_original = arguments[i + 1]
    elif arguments[i] == '-out':
        srt_target = arguments[i + 1]
    elif arguments[i] == '-source':
        source_lang = arguments[i + 1]
    elif arguments[i] == '-target':
        target_lang = arguments[i + 1]

#for testing purposes gave the variables values
srt_original = 'Dunkirk.2017.720p.BluRay.x264-SPARKS.srt'
srt_target = 'fullTranslationInSpanish.srt'
source_lang = 'EN'
target_lang = 'ES'

import pysrt
subs = pysrt.open(srt_original)  #holds the srt input file
import deepl

words_per_screen = []  #holds number of words on that frame of the srt file before translation
for i in range(len(subs)):  # fills words per screen with the number of words per frame
    words = subs[i].text.split(' ')
    for word in range(len(words)):
        # Break words joined by a newline into two list entries.
        # NOTE(review): replace('\n', ' ') then split(' ') only handles
        # one embedded newline per word — confirm multi-line cues.
        words[word] = words[word].replace('\n', ' ')
        split = words[word].split(' ')
        if len(split) > 1:
            words[word] = split[0]
            words.insert(word + 1, split[1])
    words_per_screen.append(len(words))
def parse(self, file_name, file_encoding):
    """Parse *file_name* as SubRip text and cache the cues on the instance."""
    entries = pysrt.open(file_name, file_encoding)
    self._sub_entries = entries
import pysrt,simplekml,csv

# Drone telemetry .srt: each cue's text carries comma-separated
# coordinates for that moment of the flight.
video_path = '/home/mayank/Documents/Skylark_Drones_Tasks/software_dev/videos/DJI_0301.SRT' # video_path(SRT) and its name

telemetry = pysrt.open(video_path)
kml = simplekml.Kml()

for frame in telemetry:
    # Split the cue text into its coordinate fields.
    coordinates = frame.text.split(',')
    # One KML point per frame, named by the cue's start timestamp.
    kml.newpoint(name=str(frame.start),
                 coords=[(coordinates[0], coordinates[1])])

kml.save('drone.kml')
from docx import Document
from docx.shared import Inches
import pysrt

# Convert an .srt into a .docx transcript: a title page followed by one
# timestamp paragraph and one text paragraph per cue.
in_file = 'motocultor.srt'
doc_name = in_file.split('.')[0]
out_file = in_file.replace('.srt', '2.docx')

subs = pysrt.open(in_file, encoding='ansi')

document = Document()
document.add_paragraph(doc_name.upper() + '.')
document.add_paragraph('(Tiempo de duración: )\n')
document.add_picture('monty-truth.jpg', width=Inches(1.25))

for cue in subs:
    print(cue.start)
    print(cue.text)
    document.add_paragraph(str(cue.start))
    document.add_paragraph(cue.text + '\n')

document.save(out_file)
font = cv2.FONT_HERSHEY_SIMPLEX overlay = image.copy() cv2.rectangle(overlay, (1, height-15-11), (width-1, height-1), (255,0,0), -1) opacity = 0.4 cv2.addWeighted(overlay, opacity, image, 1 - opacity, 0, image) cv2.putText(image,text,(10,height-10), font, 0.4, (255,255,255),1) cv2.imwrite('frame%04d.jpg' % (ms/1000), image) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('video', help = 'path to video file') parser.add_argument('sub', help = 'path to sub file') args = parser.parse_args() subs = pysrt.open(args.sub) videofile = args.video for i in xrange(0, len(subs)): seconds_start = (subs[i].start.seconds) + (subs[i].start.minutes * 60) + (subs[i].start.hours * 60 * 60) seconds_end = (subs[i].start.seconds) + (subs[i].start.minutes * 60) + (subs[i].start.hours * 60 * 60) ms = mid(seconds_start*1000,seconds_end*1000) save_new_frame(videofile, ms, str(subs[i].text))
# Translate an English .srt to Hindi line-by-line with googletrans,
# then (below, truncated) synthesize speech per cue with gTTS/pydub.
# NOTE(review): fragment is truncated — the final `for sub in subs:`
# body is not visible. Also note the output file is hindi2.srt while
# hindi.srt is re-opened below; confirm this mismatch is intentional.
from googletrans import Translator

translator = Translator()
f1 = open('friends.srt', "r", encoding="utf-8")
f2 = open('hindi2.srt', "a", encoding="utf-8")
for line in f1:
    # Translates every physical line, including cue numbers and
    # timestamp lines, not just the text.
    tr = translator.translate(line, dest='hi')
    print(tr.text)
    f2.write(tr.text + "\n")
f1.close()
f2.close()

import pysrt
subs = pysrt.open('hindi.srt')

from pydub import AudioSegment
from pydub.playback import play
from gtts import gTTS
import os


def speed_swifter(sound, speed):
    # Respawn the raw data with a scaled frame rate.
    # NOTE(review): overriding frame_rate changes playback speed and
    # pitch together — confirm that is the intended effect.
    return sound._spawn(
        sound.raw_data,
        overrides={"frame_rate": int(sound.frame_rate * speed)})


startmilli = 0
s = AudioSegment.silent(duration=0)
for sub in subs:
#!//Users/tkirke/anaconda/bin/python import pysrt, sys, os fil = sys.argv[1] try: subs = pysrt.open(fil, encoding ='utf-8') except: subs = pysrt.open(fil, encoding ='iso-8859-1') size = len(subs) count = 0 for i in subs: s = subs[count] count = count+1 t = (44100*(60*s.start.minutes + s.start.seconds + 0.001*s.start.milliseconds)) print str(t)+"\t"+s.text
def read(srtfile):
    """Parse *srtfile* and return the resulting SubRipFile."""
    parsed = pysrt.open(srtfile)
    return parsed
def CreateCleanSubAndMuteList(self, cleanSubsFileSpec=None):
    """Build a profanity-censored subtitle file and a matching ffmpeg
    mute-filter list.

    Loads the swears map from self.swearsFileSpec ("word|replacement",
    default replacement "*****"), detects the input subtitle encoding
    with libmagic, rewrites matching cue text, saves the modified cues
    to self.cleanSubsFileSpec and the remaining set to
    self.cleanSubsNotModFileSpec, and fills self.muteTimeList with
    volume=0 filter expressions covering each modified cue.

    NOTE(review): indentation is reconstructed from a collapsed source —
    in particular whether the shutil.copyfile backup sits inside the
    else-branch, and the loop level of newSubsNotMod.append (the
    commented-out `#else:` suggests it now runs for every cue), should
    be confirmed against the original file.
    """
    subFileParts = os.path.splitext(self.inputSubsFileSpec)
    if cleanSubsFileSpec is not None:
        # Caller supplied the output path explicitly.
        self.cleanSubsFileSpec = cleanSubsFileSpec
        subFileParts = os.path.splitext(self.cleanSubsFileSpec)
        self.cleanSubsNotModFileSpec = subFileParts[0] + "_all_not_cleaned" + subFileParts[1]
    else:
        # Derive ".clean...forced" / ".clean" names from the input spec.
        #self.cleanSubsFileSpec = subFileParts[0] + "_clean" + subFileParts[1]
        subFileFirstParts = os.path.splitext(subFileParts[0])
        self.cleanSubsFileSpec = subFileFirstParts[0] + ".clean" + subFileFirstParts[1] + ".forced" + subFileParts[1]
        #self.cleanSubsNotModFileSpec = subFileFirstParts[0] + ".all_not_cleaned" + subFileFirstParts[1] + subFileParts[1]
        self.cleanSubsNotModFileSpec = subFileFirstParts[0] + '.clean' + subFileFirstParts[1] + subFileParts[1]
        if os.path.isfile(self.inputSubsFileSpec):
            # Keep a ".orig" backup of the untouched input.
            shutil.copyfile(self.inputSubsFileSpec,
                            subFileFirstParts[0] + '.orig' + subFileFirstParts[1] + subFileParts[1])
    # remove brackets that interfere with ffmpeg subtitles filter
    self.cleanSubsFileSpec = self.cleanSubsFileSpec.translate({ord(x): '' for x in ['[', ']']})
    # Load the "word|replacement" swears map.
    lines = []
    with open(self.swearsFileSpec) as f:
        lines = [line.rstrip('\n') for line in f]
    for line in lines:
        lineMap = line.split("|")
        if len(lineMap) > 1:
            self.swearsMap[lineMap[0]] = lineMap[1]
        else:
            self.swearsMap[lineMap[0]] = "*****"
    # Whole-word, case-insensitive alternation over all swears.
    replacer = re.compile(r'\b(' + '|'.join(self.swearsMap.keys()) + r')\b', re.IGNORECASE)
    # Sniff the subtitle file's character encoding with libmagic.
    blob = open(self.inputSubsFileSpec, 'rb').read()
    m = magic.open(magic.MAGIC_MIME_ENCODING)
    m.load()
    encoding = m.buffer(blob)
    subs = pysrt.open(self.inputSubsFileSpec, encoding=encoding)
    newSubs = pysrt.SubRipFile()
    newSubsNotMod = pysrt.SubRipFile()
    for sub in subs:
        newText = replacer.sub(lambda x: self.swearsMap[x.group()], sub.text)
        #print("old: "+sub.text+", new: "+newText)
        if (newText != sub.text):
            # NOTE(review): `newSub = sub` aliases the cue rather than
            # copying it, so the original cue object is mutated too.
            newSub = sub
            newSub.text = newText
            newSubs.append(newSub)
        #else:
        newSubsNotMod.append(sub)
    newSubs.save(self.cleanSubsFileSpec)
    newSubsNotMod.save(self.cleanSubsNotModFileSpec)
    # Collect (start, end) time pairs of every censored cue...
    newLines = []
    for sub in newSubs:
        newLines.append([sub.start.to_time(), sub.end.to_time()])
    # ...and turn each into an ffmpeg volume filter that mutes the span.
    self.muteTimeList = []
    for timePair in newLines:
        lineStart = (timePair[0].hour * 60.0 * 60.0) + (timePair[0].minute * 60.0) + timePair[0].second + (timePair[0].microsecond / 1000000.0)
        lineEnd = (timePair[1].hour * 60.0 * 60.0) + (timePair[1].minute * 60.0) + timePair[1].second + (timePair[1].microsecond / 1000000.0)
        self.muteTimeList.append("volume=enable='between(t," + format(lineStart, '.3f') + "," + format(lineEnd, '.3f') + ")':volume=0")
def makeGif(source, sub_index, rand=False, no_quote=False, custom_subtitle=""):
    """Render an animated gif for one subtitle cue (Python 2).

    Uses VLC to dump frames between the cue's start and end times,
    draws the cue text (or *custom_subtitle*) onto each frame with PIL,
    and writes the result as star_wars.gif. With *rand* a random cue is
    chosen; with *no_quote* the span between this cue's end and the next
    cue's start is used instead (a quote-free gap).

    NOTE(review): indentation is reconstructed from a collapsed source;
    the `try: return text / except: return []` tail can only return on
    the `rand` path — confirm against the original.
    """
    config = ConfigParser.ConfigParser()
    config.read("config.cfg")
    config.sections()
    vlc_path = config.get("general", "vlc_path")
    video_path = config.get("general", "ep" + str(source) + "_path")
    screencap_path = os.path.join(os.path.dirname(__file__), "screencaps")
    # delete the contents of the screencap path
    file_list = os.listdir(screencap_path)
    for file_name in file_list:
        os.remove(os.path.join(screencap_path, file_name))
    # read in the quotes for the selected movie
    subs = pysrt.open(sub_files[source])
    if rand:
        sub_index = random.randint(0, len(subs) - 1)
    # Cue boundaries as fractional seconds for VLC's --start/--stop.
    if no_quote:
        start = (3600 * subs[sub_index].end.hours) + (
            60 * subs[sub_index].end.minutes) + subs[sub_index].end.seconds + (
            0.001 * subs[sub_index].end.milliseconds)
        end = (3600 * subs[sub_index + 1].start.hours) + (
            60 * subs[sub_index + 1].start.minutes) + subs[sub_index + 1].start.seconds + (
            0.001 * subs[sub_index + 1].start.milliseconds)
    else:
        start = (3600 * subs[sub_index].start.hours) + (
            60 * subs[sub_index].start.minutes) + subs[sub_index].start.seconds + (
            0.001 * subs[sub_index].start.milliseconds)
        end = (3600 * subs[sub_index].end.hours) + (
            60 * subs[sub_index].end.minutes) + subs[sub_index].end.seconds + (
            0.001 * subs[sub_index].end.milliseconds)
    text = striptags(subs[sub_index].text).split("\n")
    if len(custom_subtitle) > 0:
        text = [custom_subtitle]
    # tell vlc to go get images for gifs
    cmd = " ".join([
        '"{vlc_path}"', '-Idummy', '--video-filter', 'scene', '-V', 'dummy',
        '--no-audio', '--scene-height=256', '--scene-width=512',
        '--scene-format=png', '--scene-ratio=1', '--start-time={start}',
        '--stop-time={end}', '--scene-prefix=thumb',
        '--scene-path="{screencap_path}"', '"{video_path}"', 'vlc://quit'
    ]).format(
        **{
            "vlc_path": vlc_path,
            "start": start,
            "end": end,
            "screencap_path": screencap_path,
            "video_path": video_path
        })
    os.popen(cmd)
    file_names = sorted((fn for fn in os.listdir(screencap_path)))
    images = []
    font = ImageFont.truetype("fonts/DejaVuSansCondensed-BoldOblique.ttf", 16)
    # remove the first image from the list
    file_names.pop(0)
    for f in file_names:
        try:
            image = Image.open(os.path.join(screencap_path, f))
            draw = ImageDraw.Draw(image)
            # Remember the first frame's size for caption placement.
            try:
                image_size
            except NameError:
                image_size = image.size
            # deal with multi-line quotes
            try:
                if len(text) == 2:  # at most 2?
                    text_size = font.getsize(text[0])
                    x = (image_size[0] / 2) - (text_size[0] / 2)
                    y = image_size[1] - (2 * text_size[1]) - 5  # padding
                    drawText(draw, x, y, text[0], font)
                    text_size = font.getsize(text[1])
                    x = (image_size[0] / 2) - (text_size[0] / 2)
                    y += text_size[1]
                    drawText(draw, x, y, text[1], font)
                else:
                    text_size = font.getsize(text[0])
                    x = (image_size[0] / 2) - (text_size[0] / 2)
                    y = image_size[1] - text_size[1] - 5  # padding
                    drawText(draw, x, y, text[0], font)
            except NameError:
                pass  # do nothing.
            # if not all black?
            if image.getbbox():
                # add it to the array
                images.append(array(image))
                print 'image appended.'
            else:
                print 'all black frame found.'
        except IOError:
            print 'empty frame found.'
    filename = "star_wars.gif"
    # create a f****n' gif
    print "generating gif..."
    writeGif(filename, images, nq=10, dither=True)
    if rand:
        try:
            return text
        except:
            return []
def test():
    """Concatenate the text of every .srt under youtube-dl-texts/.

    Returns the combined text with newlines collapsed to spaces.
    """
    # Collect the pieces and join once instead of repeated += string
    # concatenation (which is quadratic in the total text size).
    parts = []
    for srt in glob.glob("youtube-dl-texts/*.srt"):
        parts.append(pysrt.open(srt).text)
    text = ''.join(parts)
    return text.replace('\n', ' ')
# Script: collect (start, end) time spans for subtitle utterances,
# merging "..." continuation cues into a single span.
# NOTE(review): fragment is truncated — the final elif body's
# continuation is not visible here.
import matplotlib.pyplot as plt
import scipy
from swda_time import CorpusReader
from helpers import changetime, plottrans

#################
# SUBTITLES
#################
# Read in all subtitles:
path = '../Subtitles/'
allsubs = {}
for filename in os.listdir(path):
    filename = os.path.join(path, filename)
    subs = pysrt.open(filename, encoding='iso-8859-1')
    subdict = {}
    i = 0
    for sub in subs:
        # Skip stage directions wrapped in () or [].
        # NOTE(review): this condition is true for any text that is not
        # simultaneously ()-wrapped AND []-wrapped — i.e. effectively
        # always; `and` between the two `not` clauses was probably
        # intended. Confirm against the data before relying on it.
        if not (
                sub.text.startswith('(') and sub.text.endswith(')')) or not (
                sub.text.startswith('[') and sub.text.endswith(']')):
            start = changetime(sub.start)
            end = changetime(sub.end)
            if sub.text.endswith('...'):
                # Utterance continues in the next cue: remember where
                # it started.
                laststart = changetime(sub.start)
            elif sub.text.startswith('...'):
                # Continuation cue: span from the remembered start to
                # this cue's end.
                times = (laststart, end)
                subdict[i] = times
                i += 1
            #give the same timestamp if text is said directly after eachother
            elif sub.text.startswith('-'):
                times1 = (start, end)
# NOTE(review): fragment — begins inside a method of a Tk `Application`
# class whose definition starts above the visible region. The file
# mixes a Python 2 print statement with print() calls; confirm the
# target interpreter.
            print 'here'
            pass
        # Slider spanning the whole subtitle duration, in seconds.
        self.slidend = self.start
        self.w = Scale(self.root, from_=0, to=self.slidend / 1000,
                       length=900, orient=HORIZONTAL)
        self.w.pack()

    def updateText(self, val, verbose=""):
        # Push new subtitle text into the Tk variable; optionally log it.
        if verbose:
            print(verbose)
        self.sub_var.set(val)

    def update(self, start, duration, text):
        self.updateText(text, verbose="%i (%i): %s" % (start, duration, text))


if __name__ == "__main__":
    filename = r"C:\Users\admin\Desktop\Python-files\Deskinfo\eternal-loveSUBS\Eternal.Love.E14.srt"
    print('Starting %s' % filename)
    # NOTE(review): the same path is assigned twice — likely leftover.
    filename = r"C:\Users\admin\Desktop\Python-files\Deskinfo\eternal-loveSUBS\Eternal.Love.E14.srt"
    subs = pysrt.open(filename)
    root = tk.Tk()
    app = Application(root=root, subs=subs)
    app.mainloop()
def load_srt(path: pathlib.Path):
    """Parse the SubRip file at *path* and return the cues."""
    subtitles = pysrt.open(path)
    return subtitles