def read_captions(captions, options):
    """Parse a caption document, auto-detecting its format.

    Tries SCC, SRT, SAMI, DFXP and WebVTT detection in that order and
    returns the parsed caption set from the first reader that accepts
    the input.  SCC input additionally honours ``options.lang`` and
    ``options.offset``.

    captions: raw caption text to parse.
    options:  object with ``read_invalid_positioning``, ``lang`` and
              ``offset`` attributes.
    Raises Exception when no reader recognises the input.
    """
    reader_kwargs = {
        'read_invalid_positioning': options.read_invalid_positioning
    }
    scc_reader = pycaption.SCCReader(**reader_kwargs)
    # Readers that take no extra read() arguments, in detection order.
    plain_readers = [
        pycaption.SRTReader(**reader_kwargs),
        pycaption.SAMIReader(**reader_kwargs),
        pycaption.DFXPReader(**reader_kwargs),
        pycaption.WebVTTReader(**reader_kwargs),
    ]
    if scc_reader.detect(captions):
        # SCC is the only format that takes a language and time offset.
        scc_kwargs = {'offset': int(options.offset)}
        if options.lang:
            scc_kwargs['lang'] = options.lang
        return scc_reader.read(captions, **scc_kwargs)
    for reader in plain_readers:
        if reader.detect(captions):
            return reader.read(captions)
    raise Exception('No caption format detected :(')
def mk_subs(self, transcriptions, sub_pathname):
    """Create a subtitle (SRT) file for this video.

    It is currently a huge hack, but it works good enough.

    transcriptions: list of start/end 'pointers' into the source
                    transcript; each item has 'start' and 'end' dicts
                    with 'timestamp', 'video_time' and 'text' keys
    sub_pathname:   full path to output file
    """
    # Hard-coded source transcript file.
    # NOTE(review): only works for this one show -- presumably should
    # be derived from the episode; confirm before reuse.
    transcript_filename = '12022017 NBPY SCC.scc'
    # dt = transcript_filename[:8]
    transcript_pathname = os.path.join(
        self.show_dir, "assets", "transcripts", transcript_filename)
    # transcript_start = datetime.datetime.strptime(
    #     dt + " 10:06:56", '%m%d%Y %H:%M:%S' ) - \
    #     datetime.timedelta(0, 2, 158933)
    caps = open(transcript_pathname, encoding='iso-8859-1').read()
    transcript = pycaption.SCCReader().read(caps)
    language = transcript.get_languages()[0]  # ['en-US']
    captions = transcript.get_captions(language)
    out_captions = pycaption.CaptionList()
    for transcription in transcriptions:
        # Scan the transcript for the start marker (state 0 -> 1),
        # then copy every caption -- shifted by `offset` -- until the
        # end marker is seen (state back to 0).
        state = 0
        for c in captions:
            if c.format_start() == \
                    transcription['start']['timestamp']:
                state = 1
                # Microseconds between the transcript's caption clock
                # and the cut video's clock; subtracted from every
                # caption copied below.
                offset = c.start - transcription['start'][
                    'video_time'] * 1000000
                c.nodes[0].content = transcription['start']['text']
            if state == 1:
                if c.format_start() == \
                        transcription['end']['timestamp']:
                    # End marker: overwrite text, stop copying after
                    # this caption (it is still shifted and appended).
                    c.nodes[0].content = \
                        transcription['end']['text']
                    state = 0
                c.start -= offset
                c.end -= offset
                out_captions.append(c)
    transcript.set_captions(language, out_captions)
    # writer = pycaption.DFXPWriter()
    writer = pycaption.SRTWriter()
    open(sub_pathname, 'wt').write(writer.write(transcript))
    return
def v4(self, episode): epoch = datetime.datetime(2017, 12, 2, 10, 6, 36, 841067) # 2017-12-02 10:06:36.841067 ## Get transcription data transcript_filename = '12022017 NBPY SCC.scc' transcript_pathname = os.path.join( self.show_dir, "assets", "transcripts", transcript_filename ) caps = open(transcript_pathname, encoding='iso-8859-1').read() transcript = pycaption.SCCReader().read( caps ) language = transcript.get_languages()[0] # ['en-US'] captions = transcript.get_captions( language ) cls = Cut_List.objects.filter( episode=episode, apply=True).order_by('sequence') # transcriptions = get_transcriptions(cls) for cl in cls: print( cl.get_start_wall() ) cl_start = ( cl.get_start_wall() - epoch ).total_seconds() * 1000000 cl_end = ( cl.get_end_wall() - epoch ).total_seconds() * 1000000 state = 0 for c in captions: # look for start if state == 0: if c.start > cl_start - 4000000: print( "start: {}".format(cl.start)) state = 1 # print a bunch of start if state == 1: print("{} {}".format(c.format_start(), c.get_text() )) if c.start > cl_start + 4000000: print() state = 2 # look for end if state == 2: if c.start > cl_end - 4000000: print( "end: {}".format(cl.end)) state = 3 # print a bunch of end if state == 3: print("{} {}".format(c.format_start(), c.get_text() )) if c.start > cl_end + 4000000: print() state = 4
def main(argv):
    """Command-line caption converter.

    Usage: test.py -i <inputfile> -f <inputType> -t <outputType>

    Reads *inputfile* in the given input format (srt/scc/webvtt/dfxp)
    and prints the captions converted to the output format on stdout.
    Exits 2 on bad options, 1 on bad/identical format arguments.
    """
    inputfile = ''
    inputType = ''
    outputType = ''
    try:
        # BUG FIX: optstring was "h:i:f:t:", which made plain -h an
        # error ("-h requires argument").  Also pass the long options
        # that the loop below checks for; getopt only recognises long
        # options listed here.
        opts, args = getopt.getopt(argv, "hi:f:t:",
                                   ["ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print('test.py -i <inputfile> -f <inputType> -t <outputType>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('test.py -i <inputfile> -f <inputType> -t <outputType>')
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            inputType = arg
        elif opt in ("-t", "--tfile"):
            outputType = arg
    if inputType == outputType:
        print('Error: input type and output type are same format')
        sys.exit(1)
    with io.open(inputfile) as f:
        str1 = f.read()
    inputValue = inputType.lower()
    if inputValue == 'scc':
        c = pycaption.SCCReader().read(str1)
    elif inputValue == 'srt':
        c = pycaption.SRTReader().read(str1)
    elif inputValue == 'dfxp':
        c = pycaption.DFXPReader().read(str1)
    elif inputValue == 'webvtt':
        c = pycaption.WebVTTReader().read(str1)
    else:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    outputValue = outputType.lower()
    if outputValue == 'scc':
        print(pycaption.SCCWriter().write(c))
    elif outputValue == 'srt':
        print(pycaption.SRTWriter().write(c))
    elif outputValue == 'dfxp':
        print(pycaption.DFXPWriter().write(c))
    elif outputValue == 'webvtt':
        print(pycaption.WebVTTWriter().write(c))
    else:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
def v3(self, episode):
    """Debug helper: interactively inspect transcript/cut alignment.

    Walks the transcriptions for *episode*'s applied cuts, computes the
    caption-clock offset at each start marker, then drops into an
    interactive shell so the values can be poked at by hand.
    """
    ## Get transcription data
    transcript_filename = '12022017 NBPY SCC.scc'
    transcript_pathname = os.path.join(
        self.show_dir, "assets", "transcripts", transcript_filename)
    caps = open(transcript_pathname, encoding='iso-8859-1').read()
    transcript = pycaption.SCCReader().read(caps)
    language = transcript.get_languages()[0]  # ['en-US']
    captions = transcript.get_captions(language)

    ## Get markes for this video
    cls = Cut_List.objects.filter(
        episode=episode, apply=True).order_by('sequence')
    transcriptions = get_transcriptions(cls)
    for transcription in transcriptions:
        pprint(transcription)
        state = 0
        for c in captions:
            if c.format_start() == \
                    transcription['start']['timestamp']:
                state = 1
                # Microseconds between caption clock and video clock.
                offset = c.start - transcription['start']['video_time'] * 1000000
                wc = transcription['start']['wallclock']
                # walltime that transcription file started.
                epoch = wc - datetime.timedelta(microseconds=c.start)
                print("c: {c}\nc.start: {start}\nwall_clock: {wallclock}".format(
                    c=c, start=c.start, wallclock=wc))
                print("epoch: {}".format(epoch))
                # HACK: deliberate debugging breakpoint -- prints the
                # line to paste to quit, then opens an interactive
                # shell with the locals above in scope.
                print("import sys; sys.exit()"); import code; code.interact(local=locals())
            if state == 1:
                if c.format_start() == \
                        transcription['end']['timestamp']:
                    c.nodes[0].content = \
                        transcription['end']['text']
                    state = 0
                c.start -= offset
                c.end -= offset
def read_captions(captions, options):
    """Parse a caption document, auto-detecting its format.

    Tries SCC, SRT, SAMI and DFXP detection in that order and returns
    the parsed caption set from the first reader that accepts the
    input.  SCC input additionally honours ``options.lang`` and
    ``options.offset``.

    captions: raw caption text to parse.
    options:  object with ``lang`` and ``offset`` attributes.
    Raises Exception when no reader recognises the input.
    """
    scc_reader = pycaption.SCCReader()
    srt_reader = pycaption.SRTReader()
    sami_reader = pycaption.SAMIReader()
    dfxp_reader = pycaption.DFXPReader()
    if scc_reader.detect(captions):
        if options.lang:
            return scc_reader.read(captions, lang=options.lang,
                                   offset=int(options.offset))
        else:
            # BUG FIX: this branch converted the offset with float()
            # while the lang branch (and the other read_captions
            # variant in this file) used int() -- normalise to int()
            # so both branches apply the same offset.
            return scc_reader.read(captions, offset=int(options.offset))
    elif srt_reader.detect(captions):
        return srt_reader.read(captions)
    elif sami_reader.detect(captions):
        return sami_reader.read(captions)
    elif dfxp_reader.detect(captions):
        return dfxp_reader.read(captions)
    else:
        raise Exception('No caption format detected :(')
def v6(self, episode):
    """Debug helper: print transcript captions around the episode's
    first cut start and last cut end wall-clock times."""

    def show_near(x, wall):
        # Print the captions from ~9s before *wall* until ~26s after
        # it.  *x* is only a label ("start"/"end") for the header line.
        from_epoch = (wall - epoch).total_seconds() * 1000000
        state = 0
        for c in captions:
            if state == 0:
                if c.start > from_epoch - 9000000:
                    print("{}: {}".format(x, wall))
                    state = 1
            if state == 1:
                print("{} {}".format(c.format_start(), c.get_text()))
                if c.start > from_epoch + 26000000:
                    print()
                    # NOTE(review): reconstructed as ending the scan
                    # once the window has passed -- confirm intent.
                    return

    # Wall-clock time the transcript's caption clock started at.
    epoch = datetime.datetime(2017, 12, 2, 10, 6, 36, 841067)
    # 2017-12-02 10:06:36.841067

    ## Get transcription data
    transcript_filename = '12022017 NBPY SCC.scc'
    transcript_pathname = os.path.join(
        self.show_dir, "assets", "transcripts", transcript_filename)
    caps = open(transcript_pathname, encoding='iso-8859-1').read()
    transcript = pycaption.SCCReader().read(caps)
    language = transcript.get_languages()[0]  # ['en-US']
    captions = transcript.get_captions(language)

    cls = Cut_List.objects.filter(
        episode=episode, apply=True).order_by('sequence')
    show_near("start", cls.first().get_start_wall())
    show_near("end", cls.last().get_end_wall())