def read_captions(captions, options):
    """Parse ``captions`` with the first pycaption reader that recognises it.

    Readers are probed in a fixed order: SCC, SRT, SAMI, DFXP, WebVTT.
    Every reader is constructed with
    ``read_invalid_positioning=options.read_invalid_positioning``.

    :param captions: Raw caption content handed to ``detect`` / ``read``.
    :param options: Object exposing ``read_invalid_positioning``, ``lang``
        and ``offset``. ``lang`` and ``offset`` are consulted for SCC input
        only.
    :return: Whatever the matching reader's ``read`` returns.
    :raises Exception: If none of the readers detects the content.
    """
    reader_classes = (
        pycaption.SCCReader,
        pycaption.SRTReader,
        pycaption.SAMIReader,
        pycaption.DFXPReader,
        pycaption.WebVTTReader,
    )
    shared_kwargs = {
        'read_invalid_positioning': options.read_invalid_positioning
    }
    # All readers are built up front, in probing order.
    readers = [reader_cls(**shared_kwargs) for reader_cls in reader_classes]
    scc = readers[0]

    # SCC is the only format whose read() is given an offset (and,
    # when set, a language).
    if scc.detect(captions):
        scc_kwargs = {'lang': options.lang} if options.lang else {}
        scc_kwargs['offset'] = int(options.offset)
        return scc.read(captions, **scc_kwargs)

    for candidate in readers[1:]:
        if candidate.detect(captions):
            return candidate.read(captions)

    raise Exception('No caption format detected :(')
def extract_captions(cap_fname, lang='en-US'):
    """
    Reads a WebVTT file and returns an ordered dictionary of
    {(start_time, end_time) -> "caption"}, in the order the captions are
    returned by the parsed caption set.

    Caption start/end values are passed through `_time.micros_to_sec`
    before being used as keys (presumably microseconds -> seconds, going by
    the helper's name; confirm against `_time`).

    :param cap_fname: Path of the VTT subtitle file to read from.
    :param lang: Language key to extract from the parsed caption set.
    :return: `collections.OrderedDict` mapping `(start, end)` tuples to the
      stripped caption text.
    :raises AssertionError: If the file does not exist, is not detected as
      WebVTT, is empty, does not contain `lang`, has no captions, or if two
      captions share the same `(start, end)` key.
    """
    # NOTE: validation relies on `assert`, which is stripped under `python -O`.
    assert os.path.isfile(cap_fname)
    _getSharedLogger().info("Reading captions from '%s'", cap_fname)
    reader = pycaption.WebVTTReader()

    # WebVTT files are UTF-8 by specification; do not depend on the locale's
    # default encoding, which mangles or rejects non-ASCII captions.
    with open(cap_fname, encoding='utf-8') as fin:
        captions_raw = fin.read()

    assert reader.detect(captions_raw), "Malformed file: '{}'".format(
        cap_fname)
    caption_set = reader.read(captions_raw)
    assert not caption_set.is_empty(), "Empty VTT file: '{}'".format(
        cap_fname)

    # REVIEW josephz: We'll need to check what other possibilities there are.
    assert lang in caption_set.get_languages()
    captions = caption_set.get_captions(lang=lang)
    assert len(captions) > 0
    _getSharedLogger().info("Detected '%s' captions...", len(captions))

    res = collections.OrderedDict()
    for c in captions:
        cap_raw = c.get_text()
        start = _time.micros_to_sec(c.start)
        end = _time.micros_to_sec(c.end)
        res[(start, end)] = cap_raw.strip()

    # Equal lengths guarantee no two captions collapsed onto the same key.
    assert len(res) == len(captions)
    return res
def get_transcript(link: str) -> pd.DataFrame:
    """Parse and collect transcript given link

    Looks up the video's metadata with youtube_dl (nothing is downloaded),
    fetches the English subtitle track as WebVTT and hands the captions to
    ``parse_transcript``.

    :param link: str -> YouTube URL of the resource
    :return df: pd.DataFrame -> result of ``parse_transcript``; an empty
        frame with a single ``text`` column when the video has no English
        subtitles
    """
    ydl = youtube_dl.YoutubeDL({
        'subtitlesformat': 'vtt',
        'quiet': True,
        'forcetitle': True,
        'writeautomaticsub': True,
        'simulate': True,
    })
    raw = ydl.extract_info(link, download=False)
    unique_id, title = raw['display_id'], raw['title']
    print(f"Video - {unique_id}: {title}")

    # Only the subtitle lookup is guarded, so a KeyError raised while
    # parsing is not misreported as "no subtitles". TypeError covers
    # 'requested_subtitles' being None, which youtube_dl uses when no
    # subtitle track matched.
    try:
        sub_url = raw['requested_subtitles']['en']['url']
    except (KeyError, TypeError):
        print(f"{title} [{unique_id}] has no English subtitles! Exiting ...")
        return pd.DataFrame([], columns=['text'])

    resp = requests.get(sub_url, stream=True)
    try:
        payload = b''.join(resp.iter_content(1024))
    finally:
        # Release the streamed connection even if the transfer fails.
        resp.close()

    arr = pycaption.WebVTTReader().read(payload.decode('utf-8'))
    transcript = arr.get_captions('en-US')
    return parse_transcript(transcript)
def main(argv):
    """Convert a caption file between formats and print the result.

    Recognised options:
      -h, --help          print usage and exit
      -i, --ifile PATH    caption file to read
      -f, --sfile TYPE    input format  (srt/scc/webvtt/dfxp)
      -t, --tfile TYPE    output format (srt/scc/webvtt/dfxp)

    Format names are case-insensitive. The converted captions are written
    to standard output.

    :param argv: Command-line arguments without the program name.
    :raises SystemExit: Status 2 on malformed options; status 1 when the
        input and output formats are the same or either is unsupported;
        no status after printing help.
    """
    usage = 'test.py -i <inputfile> -f <intputType> -t <outputType>'
    inputfile = ''
    inputType = ''
    outputType = ''
    try:
        # "h" takes no argument. The long names are the ones the loop below
        # already checks for; they must be registered here to be reachable.
        opts, _args = getopt.getopt(
            argv, "hi:f:t:", ["help", "ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            inputType = arg
        elif opt in ("-t", "--tfile"):
            outputType = arg

    # Normalise before comparing so that e.g. "SRT" -> "srt" is rejected too.
    inputValue = inputType.lower()
    outputValue = outputType.lower()
    if inputValue == outputValue:
        print('Error: input type and output type are same format')
        sys.exit(1)

    with io.open(inputfile) as f:
        str1 = f.read()

    if inputValue == 'scc':
        c = pycaption.SCCReader().read(str1)
    elif inputValue == 'srt':
        c = pycaption.SRTReader().read(str1)
    elif inputValue == 'dfxp':
        c = pycaption.DFXPReader().read(str1)
    elif inputValue == 'webvtt':
        c = pycaption.WebVTTReader().read(str1)
    else:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)

    if outputValue == 'scc':
        print(pycaption.SCCWriter().write(c))
    elif outputValue == 'srt':
        print(pycaption.SRTWriter().write(c))
    elif outputValue == 'dfxp':
        print(pycaption.DFXPWriter().write(c))
    elif outputValue == 'webvtt':
        print(pycaption.WebVTTWriter().write(c))
    else:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
def read_captions(captions, options):
    """Detect the caption format of ``captions`` and parse it.

    Default-constructed pycaption readers are tried in the order SCC, SRT,
    SAMI, DFXP, WebVTT; the first one whose ``detect`` accepts the content
    does the parsing.

    :param captions: Raw caption content handed to ``detect`` / ``read``.
    :param options: Object exposing ``lang`` and ``offset``; both are used
        only when the content is detected as SCC.
    :return: Whatever the matching reader's ``read`` returns.
    :raises Exception: If no reader detects the content.
    """
    scc = pycaption.SCCReader()
    candidates = [
        scc,
        pycaption.SRTReader(),
        pycaption.SAMIReader(),
        pycaption.DFXPReader(),
        pycaption.WebVTTReader(),
    ]

    # next() stops probing at the first reader that claims the content.
    matched = next(
        (reader for reader in candidates if reader.detect(captions)), None)
    if matched is None:
        raise Exception('No caption format detected :(')

    if matched is not scc:
        return matched.read(captions)

    # SCC alone is read with an offset and, when provided, a language.
    if options.lang:
        return scc.read(
            captions, lang=options.lang, offset=int(options.offset))
    return scc.read(captions, offset=int(options.offset))