Ejemplo n.º 1
0
def read_captions(captions, options):
    """Parse ``captions`` with the first pycaption reader that recognises it.

    Readers are tried in a fixed order: SCC, SRT, SAMI, DFXP, WebVTT.  Every
    reader is constructed with ``options.read_invalid_positioning``.  Only the
    SCC reader is given extra arguments to ``read``: ``options.offset``
    (converted with ``int``) and, when ``options.lang`` is truthy, ``lang``.

    Returns:
        Whatever the matching reader's ``read`` method returns.

    Raises:
        Exception: if no reader's ``detect`` accepts the content.
    """
    shared_kwargs = dict(
        read_invalid_positioning=options.read_invalid_positioning)

    scc = pycaption.SCCReader(**shared_kwargs)
    # The remaining readers, in detection order; their read() is called with
    # the caption content only.
    plain_readers = (
        pycaption.SRTReader(**shared_kwargs),
        pycaption.SAMIReader(**shared_kwargs),
        pycaption.DFXPReader(**shared_kwargs),
        pycaption.WebVTTReader(**shared_kwargs),
    )

    if scc.detect(captions):
        if not options.lang:
            return scc.read(captions, offset=int(options.offset))
        return scc.read(captions,
                        lang=options.lang,
                        offset=int(options.offset))

    for reader in plain_readers:
        if reader.detect(captions):
            return reader.read(captions)

    raise Exception('No caption format detected :(')
Ejemplo n.º 2
0
def main(argv):
    """Convert a caption file between formats and print the result to stdout.

    Args:
        argv: Command-line arguments without the program name.  Recognised
            options are ``-i``/``--ifile`` (input path), ``-f``/``--sfile``
            (input format), ``-t``/``--tfile`` (output format) and
            ``-h``/``--help``.  Formats are ``srt``, ``scc``, ``webvtt`` or
            ``dfxp`` and are matched case-insensitively.

    The process exits with status 2 when the options cannot be parsed and
    with status 1 when the input and output formats are identical or either
    one is not supported.  Both formats are validated before the input file
    is opened.
    """
    usage = 'test.py -i <inputfile> -f <intputType> -t <outputType>'
    # Class names are resolved with getattr only after validation, so bad
    # arguments are rejected before pycaption or the input file is touched.
    reader_names = {
        'scc': 'SCCReader',
        'srt': 'SRTReader',
        'dfxp': 'DFXPReader',
        'webvtt': 'WebVTTReader',
    }
    writer_names = {
        'scc': 'SCCWriter',
        'srt': 'SRTWriter',
        'dfxp': 'DFXPWriter',
        'webvtt': 'WebVTTWriter',
    }

    inputfile = ''
    input_type = ''
    output_type = ''

    try:
        # -h is a flag and takes no argument; the long options checked in the
        # loop below must be declared here for getopt to accept them.
        opts, _ = getopt.getopt(
            argv, "hi:f:t:", ["help", "ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            input_type = arg
        elif opt in ("-t", "--tfile"):
            output_type = arg

    # Normalise case before comparing so that e.g. "srt" and "SRT" are
    # recognised as the same format.
    input_value = input_type.lower()
    output_value = output_type.lower()

    if input_value == output_value:
        print('Error: input type and output type are same format')
        sys.exit(1)
    if input_value not in reader_names:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    if output_value not in writer_names:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)

    with io.open(inputfile) as f:
        content = f.read()

    reader = getattr(pycaption, reader_names[input_value])()
    writer = getattr(pycaption, writer_names[output_value])()
    print(writer.write(reader.read(content)))
Ejemplo n.º 3
0
def download_subs(link, output_folder, name):
    """Fetch a DFXP subtitle document and save it as ``<name>.srt``.

    Args:
        link: URL of the DFXP document; requested with a browser-like
            ``user-agent`` header.
        output_folder: Directory the ``.srt`` file is written into.
        name: Output file name without the ``.srt`` extension.

    The response text is parsed with ``pycaption.DFXPReader``, converted with
    ``pycaption.SRTWriter`` and written as UTF-8.
    """
    browser_headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:70.0) Gecko/20100101 Firefox/70.0'
    }
    response = requests.get(link, headers=browser_headers)

    parsed = pycaption.DFXPReader().read(response.text)
    srt_text = pycaption.SRTWriter().write(parsed)

    target = os.path.join(output_folder, name + '.srt')
    with io.open(target, 'w', encoding='utf-8') as srt_file:
        srt_file.write(srt_text)
Ejemplo n.º 4
0
def read_captions(captions, options):
    """Parse ``captions`` with the first pycaption reader that recognises it.

    Readers are tried in the order SCC, SRT, SAMI, DFXP.  Only the SCC reader
    is given extra arguments to ``read``: ``options.offset`` (converted with
    ``float``) and, when ``options.lang`` is truthy, ``lang``.

    Returns:
        Whatever the matching reader's ``read`` method returns.

    Raises:
        Exception: if no reader's ``detect`` accepts the content.
    """
    scc_reader = pycaption.SCCReader()
    srt_reader = pycaption.SRTReader()
    sami_reader = pycaption.SAMIReader()
    dfxp_reader = pycaption.DFXPReader()

    if scc_reader.detect(captions):
        # Parse the offset identically whether or not a language is given, so
        # a fractional offset is neither rejected nor truncated only when
        # ``lang`` is set.
        scc_kwargs = {'offset': float(options.offset)}
        if options.lang:
            scc_kwargs['lang'] = options.lang
        return scc_reader.read(captions, **scc_kwargs)
    elif srt_reader.detect(captions):
        return srt_reader.read(captions)
    elif sami_reader.detect(captions):
        return sami_reader.read(captions)
    elif dfxp_reader.detect(captions):
        return dfxp_reader.read(captions)
    else:
        raise Exception('No caption format detected :(')
Ejemplo n.º 5
0
    def download_from_ism(self, url, output_name, output_format):
        """Download and merge the text tracks listed in ``{url}/manifest``.

        For every ``StreamIndex`` whose ``@Type`` is ``text``, each segment is
        fetched, converted with ``self.ismt_to_ttml`` and parsed.  The first
        segment's document becomes the base; the cues (``p`` elements) of the
        later segments get their ``begin``/``end`` rewritten to absolute
        seconds and are appended to it.  The merged document is written into
        ``self.output_dir``.

        Args:
            url: Base URL; ``/manifest`` and the segment paths are appended.
            output_name: Base name of the output file; spaces become dots.
            output_format: ``'ttml'`` writes the merged TTML, ``'srt'``
                converts it with pycaption first.  For any other value the
                file is created but nothing is written to it.  The file name
                ends in ``.srt`` regardless of this value.

        Exits the process with status 1 if a text stream's FourCC is not
        ``TTML``.  Returns ``None``; returns early, without writing the
        current track, if a cue ends before it begins.
        """
        r = self.session.get(f'{url}/manifest')
        manifest = xmltodict.parse(r.content, force_list={'StreamIndex', 'c'})
        self.logger.debug(json.dumps(manifest, indent=4))

        for (index, stream) in enumerate(
                manifest['SmoothStreamingMedia']['StreamIndex']):
            if stream['@Type'] != 'text':
                continue

            lang = stream['@Language'].lower()

            fmt = stream['QualityLevel']['@FourCC'].upper()
            if fmt != 'TTML':
                self.logger.error(
                    f'Stream has unsupported subtitle format: {fmt!r}')
                sys.exit(1)

            # NOTE(review): presumably assumes two non-text streams precede
            # the text streams, so track numbering starts at 0 -- confirm.
            index -= 2
            output = f'{output_name.replace(" ", ".")}.{lang}.{index}.srt'
            output = pathvalidate.sanitize_filename(output)
            output = os.path.join(self.output_dir, output)
            self.logger.info(f'Saving subtitle track #{index} to {output}')

            path = stream['@Url'].replace('{bitrate}',
                                          stream['QualityLevel']['@Bitrate'])

            # Build the list of segment start times from the chunk entries:
            # '@t' sets an explicit start time, '@d' is the duration added
            # for each repetition counted by '@r'.
            t = 0
            ts = []

            for c in stream['c']:
                if c.get('@t'):
                    t = int(c['@t'])
                    ts.append(t)

                if not c.get('@d'):
                    # Stream only has a single segment
                    break

                # Attribute values are parsed as strings, so the repeat
                # count must be converted before it can drive range().
                for _ in range(int(c.get('@r', 1))):
                    t += int(c['@d'])
                    ts.append(t)

            ts = ts[:-1]  # Remove nonexistent last segment

            xml = None

            for (i, t) in enumerate(ts):
                self.logger.debug(f'Downloading segment {i + 1} of {len(ts)}')
                seg_url = f'{url}/{path.replace("{start time}", str(t))}'
                seg = self.session.get(seg_url).content

                if not seg:
                    # Empty segment
                    continue

                data = self.ismt_to_ttml(seg).decode('utf-8')

                # <br/> elements are swapped for a text placeholder before
                # parsing and restored after the merged document is
                # serialised again.
                assert '{{BR}}' not in data, 'input data contains br placeholder'
                data = re.sub(r'<br ?/>', '{{BR}}', data)

                xml_seg = xmltodict.parse(
                    data,
                    force_list={'p'},
                    process_namespaces=True,
                    namespaces={
                        'http://www.w3.org/XML/1998/namespace': None,
                        'http://www.w3.org/2006/10/ttaf1': None,
                        'http://www.w3.org/2006/10/ttaf1#metadata': None,
                        'http://www.w3.org/2006/10/ttaf1#styling': None,
                    },
                )

                if i == 0:
                    # The first segment is the base document that later
                    # segments are merged into; its own cues are kept as-is.
                    # NOTE(review): if segment 0 is empty, the `continue`
                    # above skips this branch and `xml`/`fps` stay unset for
                    # the following segments -- confirm that cannot happen.
                    xml = xml_seg

                    # NOTE(review): with process_namespaces=True these keys
                    # may be namespace-expanded rather than 'ttp:'-prefixed
                    # -- confirm this lookup ever matches.
                    fps_base = xml['tt'].get('@ttp:frameRate')
                    fps_mult = xml['tt'].get('@ttp:frameRateMultiplier')

                    if xml['tt']['body']['div'] is None:
                        xml['tt']['body']['div'] = {'p': []}

                    if fps_base:
                        if fps_mult:
                            # The multiplier is two space-separated integers
                            # forming a fraction.
                            mult = [int(x) for x in fps_mult.split(' ')]
                            mult = truediv(*mult)
                        else:
                            mult = 1

                        # Effective rate is the numeric base rate scaled by
                        # the multiplier fraction computed above.
                        fps = int(fps_base) * mult
                    else:
                        fps = 30  # Per TTML spec

                else:
                    div = xml_seg['tt']['body']['div']

                    if div is None:
                        # Empty subtitle file
                        continue

                    subs = div['p']

                    # Segment start time converted from timescale ticks to
                    # seconds; added to every cue time of this segment.
                    scale = int(stream['@TimeScale'])
                    offset = t / scale

                    for p in subs:
                        for a in ('@begin', '@end'):
                            # Times are expected as 'h:m:s:f' with f counted
                            # in frames; rewritten as absolute seconds.
                            tc = p[a]
                            (h, m, s, f) = [int(x) for x in tc.split(':')]
                            total = round(
                                h * 3600 + m * 60 + s + f / fps + offset, 3)
                            p[a] = f'{total}s'

                        begin = float(p['@begin'][:-1])
                        end = float(p['@end'][:-1])

                        if end < begin:
                            self.logger.error(
                                f'End time is earlier than start time ({end} < {begin})',
                            )
                            return

                    xml['tt']['body']['div']['p'].extend(subs)

            xml_data = xmltodict.unparse(xml)
            xml_data = xml_data.replace('{{BR}}', '<br />')

            os.makedirs(self.output_dir, exist_ok=True)

            with open(output, 'wb') as fd:
                if output_format == 'ttml':
                    fd.write(xml_data.encode('utf-8-sig'))
                elif output_format == 'srt':
                    self.logger.debug('Converting to SRT')
                    r = pycaption.DFXPReader().read(xml_data)
                    w = pycaption.SRTWriter().write(r)
                    fd.write(w.encode('utf-8-sig'))