Ejemplo n.º 1
0
def read_captions(captions, options):
    """Detect the format of ``captions`` and parse it with the matching reader.

    Formats are probed in this order: SCC, SRT, SAMI, DFXP, WebVTT. Every
    reader is constructed with ``read_invalid_positioning`` taken from
    ``options``.

    :param captions: caption content handed to each reader's ``detect`` and
        ``read`` methods.
    :param options: object providing ``read_invalid_positioning``, ``lang``
        and ``offset``; the last two are only consulted for SCC input.
    :return: whatever the matching reader's ``read`` returns.
    :raises Exception: if none of the readers recognises the content.
    """
    shared_kwargs = dict(
        read_invalid_positioning=options.read_invalid_positioning)

    # All readers are built up front, in the same order they are probed.
    scc = pycaption.SCCReader(**shared_kwargs)
    remaining = (
        pycaption.SRTReader(**shared_kwargs),
        pycaption.SAMIReader(**shared_kwargs),
        pycaption.DFXPReader(**shared_kwargs),
        pycaption.WebVTTReader(**shared_kwargs),
    )

    if scc.detect(captions):
        # Only the SCC reader is given a language and an offset; the language
        # is forwarded only when one was actually supplied.
        scc_kwargs = {}
        if options.lang:
            scc_kwargs['lang'] = options.lang
        scc_kwargs['offset'] = int(options.offset)
        return scc.read(captions, **scc_kwargs)

    for candidate in remaining:
        if candidate.detect(captions):
            return candidate.read(captions)

    raise Exception('No caption format detected :(')
Ejemplo n.º 2
0
def extract_captions(cap_fname, lang='en-US'):
    """Read a WebVTT file and return its captions keyed by time span.

    :param cap_fname: path of the VTT subtitle file to read from.
    :param lang: language code of the captions to pull out of the parsed
        caption set.
    :return: ``collections.OrderedDict`` mapping ``(start, end)`` to the
        stripped caption text, in the order the captions were read. ``start``
        and ``end`` are the caption's ``start``/``end`` values passed through
        ``_time.micros_to_sec``.
    :raises AssertionError: if ``cap_fname`` is not a file, the content is not
        detected as WebVTT, the caption set is empty, ``lang`` is not among
        its languages, it has no captions for ``lang``, or two captions share
        the same ``(start, end)`` key.
    """
    assert os.path.isfile(cap_fname)
    log = _getSharedLogger()
    log.info("Reading captions from '%s'", cap_fname)

    # WebVTT files are UTF-8 by specification; do not rely on the platform's
    # default encoding, which would mangle non-ASCII captions elsewhere.
    with open(cap_fname, encoding='utf-8') as fin:
        captions_raw = fin.read()

    reader = pycaption.WebVTTReader()
    assert reader.detect(captions_raw), "Malformed file: '{}'".format(
        cap_fname)

    caption_set = reader.read(captions_raw)
    assert not caption_set.is_empty(), "Empty VTT file: '{}'".format(
        cap_fname)
    # REVIEW josephz: We'll need to check what other possibilities there are.
    assert lang in caption_set.get_languages()

    captions = caption_set.get_captions(lang=lang)
    assert len(captions) > 0

    log.info("Detected '%s' captions...", len(captions))
    res = collections.OrderedDict()
    for c in captions:
        span = (_time.micros_to_sec(c.start), _time.micros_to_sec(c.end))
        res[span] = c.get_text().strip()
    # A shorter dict means two captions collapsed onto the same time span.
    assert len(res) == len(captions)
    return res
Ejemplo n.º 3
0
def get_transcript(link: str) -> pd.DataFrame:
    """Download the English subtitles of a video and parse them.

    :param link: str -> YouTube URL of the resource
    :return df: pd.DataFrame -> result of ``parse_transcript`` on the English
        captions, or an empty frame with a single ``text`` column when the
        video exposes no English subtitle track
    """
    ydl = youtube_dl.YoutubeDL({
        'subtitlesformat': 'vtt',
        'quiet': True,
        'forcetitle': True,
        'writeautomaticsub': True,
        'simulate': True
    })
    raw = ydl.extract_info(link, download=False)
    unique_id, title = raw['display_id'], raw['title']
    print(f"Video - {unique_id}: {title}")

    # 'requested_subtitles' may be missing or None (youtube-dl reports None
    # when nothing is available), and the 'en' track or its 'url' may be
    # absent; all of these mean "no English subtitles" rather than a crash.
    requested = raw.get('requested_subtitles') or {}
    sub_url = (requested.get('en') or {}).get('url')
    if not sub_url:
        print(f"{title} [{unique_id}] has no English subtitles! Exiting ...")
        return pd.DataFrame([], columns=['text'])

    # Parsing happens outside any KeyError handler so that a genuine parsing
    # failure is not misreported as missing subtitles.
    resp = requests.get(sub_url)
    vtt_text = resp.content.decode('utf-8')
    caption_set = pycaption.WebVTTReader().read(vtt_text)
    transcript = caption_set.get_captions('en-US')
    return parse_transcript(transcript)
Ejemplo n.º 4
0
def main(argv):
    """Convert a caption file from one format to another and print it.

    Recognised options:
      -h, --help      print the usage line and exit
      -i, --ifile     path of the input caption file
      -f, --sfile     input format  (srt / scc / webvtt / dfxp)
      -t, --tfile     output format (srt / scc / webvtt / dfxp)

    Format names are matched case-insensitively.

    :param argv: list of command-line arguments to parse with ``getopt``.
    :raises SystemExit: status 2 on unparsable options, status 1 on an
        invalid or identical pair of formats, and with no status after
        printing help.
    """
    usage = 'test.py -i <inputfile> -f <intputType> -t <outputType>'
    inputfile = ''
    inputType = ''
    outputType = ''

    try:
        # "h" takes no argument; the long names match the checks below.
        opts, _ = getopt.getopt(
            argv, "hi:f:t:", ["help", "ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            inputType = arg
        elif opt in ("-t", "--tfile"):
            outputType = arg

    # Normalise case before comparing so that "srt" and "SRT" are recognised
    # as the same format.
    inputValue = inputType.lower()
    outputValue = outputType.lower()
    if inputValue == outputValue:
        print('Error: input type and output type are same format')
        sys.exit(1)

    with io.open(inputfile) as f:
        content = f.read()

    readers = {
        'scc': pycaption.SCCReader,
        'srt': pycaption.SRTReader,
        'dfxp': pycaption.DFXPReader,
        'webvtt': pycaption.WebVTTReader,
    }
    if inputValue not in readers:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    caption_set = readers[inputValue]().read(content)

    writers = {
        'scc': pycaption.SCCWriter,
        'srt': pycaption.SRTWriter,
        'dfxp': pycaption.DFXPWriter,
        'webvtt': pycaption.WebVTTWriter,
    }
    if outputValue not in writers:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    print(writers[outputValue]().write(caption_set))
Ejemplo n.º 5
0
def read_captions(captions, options):
    """Parse ``captions`` with the first pycaption reader that recognises it.

    Readers are probed in the order SCC, SRT, SAMI, DFXP, WebVTT.

    :param captions: caption content handed to each reader's ``detect`` and
        ``read`` methods.
    :param options: object providing ``lang`` and ``offset``; both are only
        consulted when the content is detected as SCC.
    :return: whatever the matching reader's ``read`` returns.
    :raises Exception: if no reader recognises the content.
    """
    scc = pycaption.SCCReader()
    srt = pycaption.SRTReader()
    sami = pycaption.SAMIReader()
    dfxp = pycaption.DFXPReader()
    webvtt = pycaption.WebVTTReader()

    if scc.detect(captions):
        # SCC is the only format that is given a language and an offset.
        if not options.lang:
            return scc.read(captions, offset=int(options.offset))
        return scc.read(captions,
                        lang=options.lang,
                        offset=int(options.offset))

    # The generator is lazy, so probing stops at the first reader that
    # accepts the content.
    fallbacks = (srt, sami, dfxp, webvtt)
    matched = next((r for r in fallbacks if r.detect(captions)), None)
    if matched is None:
        raise Exception('No caption format detected :(')
    return matched.read(captions)