Exemple #1
0
def _srt_gen_from_url(base_url, end_time=3660, verbose=True):
    """Yield SRT text for consecutive time windows fetched from ``base_url``.

    Captions are requested window by window through a ``t=<start>/<end>``
    query parameter (0/60, 61/120, 121/180, ...). Each non-empty response is
    parsed, its caption times are shifted by the end time of the last caption
    seen so far, and the result is re-serialised as SRT.

    Arguments:
        base_url -- URL the caption windows are requested from.
        end_time -- Upper bound for a window's end; fetching stops once the
            next window would end after it. The first window is always
            fetched.
        verbose -- When true, print the URL of every window before fetching.

    Yields:
        One string per window: the shifted SRT text (with a space inserted
        before every blank line), or ``''`` when the response body was empty.

    Raises:
        requests.HTTPError -- if any window request returns an error status.
    """
    dt = 60
    t0 = 0
    t1 = t0 + dt

    has_next = True
    is_first_chunk = True

    # End time of the most recent caption emitted; used as the offset for the
    # captions of the following windows.
    last_end = 0.0

    while has_next:

        if verbose:
            print('fetching captions from ' + base_url +
                  '?t={}/{}'.format(t0, t1))

        res = requests.get(base_url, params={'t': '{}/{}'.format(t0, t1)})
        res.raise_for_status()

        srt = res.text
        if is_first_chunk:
            # Only the first response has its byte-order mark removed.
            srt = srt.replace(u'\ufeff', '')
            is_first_chunk = False

        # Advance to the next window before yielding.
        t0 = t1 + 1
        t1 = t1 + dt
        has_next = t1 <= end_time

        if not srt:
            yield ''
            continue

        cc = CaptionConverter()
        cc.read(srt, SRTReader())
        captions = cc.captions.get_captions(lang='en-US')

        # The first window is shifted by the initial 0.0 offset, so it needs
        # no special case.
        for caption in captions:
            caption.start += last_end
            caption.end += last_end

        # Keep the previous offset if this window produced no captions.
        if captions:
            last_end = captions[-1].end

        srt = cc.write(SRTWriter())

        yield srt.replace('\n\n', ' \n\n')
Exemple #2
0
def route_subtitles(course_id, lecture_id):
    """Fetch a lecture's English SRT subtitles and serve them as WebVTT.

    The subtitles URL is built from ``course_id`` (formatted with ``%s``) and
    ``lecture_id`` (formatted with ``%d``). If the downloaded text contains
    no captions, an empty body is served instead.

    Returns:
        A ``Response`` with content type ``text/vtt``.
    """
    url_template = ('https://class.coursera.org/%s-001/lecture/'
                    'subtitles?q=%d_en')
    download = requests.get(url_template % (course_id, lecture_id))

    try:
        srt_to_vtt = CaptionConverter()
        srt_to_vtt.read(download.text, SRTReader())
        vtt_body = srt_to_vtt.write(WebVTTWriter())
    except CaptionReadNoCaptions:
        # Nothing to convert: serve an empty subtitle track.
        vtt_body = ''

    return Response(vtt_body, content_type='text/vtt')
Exemple #3
0
    def ttml2srt(ttml_file_path, srt_file_path=None):
        """Convert TTML subtitles to SubRip subtitles.

        Arguments:
            ttml_file_path {string} -- The path to the TTML file (read as UTF-8).
            srt_file_path {string} -- The path to the SubRip file. When omitted,
                the TTML path with its final extension replaced by ".srt".
        """
        import os

        converter = CaptionConverter()
        with open(ttml_file_path, "r", encoding="utf8") as file:
            converter.read(file.read(), DFXPReader())
        if srt_file_path is None:
            # Swap only the final extension: a plain str.replace would also
            # rewrite ".xml" inside directory names, and would leave the path
            # unchanged (overwriting the input) for names without ".xml".
            srt_file_path = os.path.splitext(ttml_file_path)[0] + ".srt"
        with open(srt_file_path, "wb") as file:
            file.write(converter.write(SRTWriter()).encode("utf-8"))
Exemple #4
0
    def srt2ttml(srt_file_path, ttml_file_path=None):
        """Convert SubRip subtitles to TTML subtitles.

        Arguments:
            srt_file_path {string} -- The path to the SubRip file (read as UTF-8).
            ttml_file_path {string} -- The path to the TTML file. When omitted,
                the SubRip path with its final extension replaced by ".xml".
        """
        import os

        converter = CaptionConverter()
        with open(srt_file_path, "r", encoding="utf8") as file:
            converter.read(file.read(), SRTReader())
        if ttml_file_path is None:
            # Swap only the final extension: a plain str.replace would also
            # rewrite ".srt" inside directory names, and would leave the path
            # unchanged (overwriting the input) for names without ".srt".
            ttml_file_path = os.path.splitext(srt_file_path)[0] + ".xml"
        with open(ttml_file_path, "wb") as file:
            file.write(converter.write(DFXPWriter()).encode("utf-8"))
Exemple #5
0
def _make_ts_from_srt(srt):
    """Turn SRT text into a list of transcript segments split on '>>'.

    The text is padded with spaces (at the ``$`` match positions and before
    every blank line), NFC-normalised and stripped of every character whose
    Unicode category starts with 'C', newlines excepted. It is then parsed as
    SRT and written out as a transcript, in which '>>> ' becomes '>>' and
    newlines become spaces.

    Returns:
        The list produced by splitting that transcript on '>>'.
    """
    converter = CaptionConverter()

    padded = re.sub('$', ' ', srt)
    padded = padded.replace('\n\n', ' \n\n')
    normalized = unicodedata.normalize('NFC', padded)

    # Drop control/format/unassigned characters but keep line breaks, which
    # the SRT reader needs to find cue boundaries.
    kept = []
    for ch in normalized:
        if ch == '\n' or not unicodedata.category(ch).startswith('C'):
            kept.append(ch)
    cleaned = ''.join(kept)

    converter.read(cleaned, SRTReader())

    transcript = converter.write(TranscriptWriter())
    transcript = transcript.replace(u'>>> ', u'>>')
    transcript = transcript.replace('\n', ' ')

    return transcript.split('>>')
Exemple #6
0
    def ttml2srt(ttml_file_path: str, srt_file_path: Optional[str] = None) -> None:
        """Convert TTML subtitles to SubRip subtitles.

        The input is read with the encoding reported by
        ``Utils.detect_encoding`` and the output is written with that same
        encoding.

        Arguments:
            ttml_file_path {string} -- The path to the TTML file.
            srt_file_path {string} -- The path to the SubRip file. When omitted,
                the TTML path with its final extension replaced by ".srt".
        """
        import os

        file: Union[TextIO, BinaryIO]
        converter = CaptionConverter()
        encoding = Utils.detect_encoding(ttml_file_path)
        with open(ttml_file_path, "r", encoding=encoding) as file:
            converter.read(file.read(), DFXPReader())
        if srt_file_path is None:
            # Swap only the final extension: a plain str.replace would also
            # rewrite ".xml" inside directory names, and would leave the path
            # unchanged (overwriting the input) for names without ".xml".
            srt_file_path = os.path.splitext(ttml_file_path)[0] + ".srt"
        with open(srt_file_path, "wb") as file:
            file.write(converter.write(SRTWriter()).encode(encoding))
Exemple #7
0
    def srt2sami(srt_file_path: str, sami_file_path: Optional[str] = None) -> None:
        """Convert SubRip subtitles to SAMI subtitles.

        The input is read with the encoding reported by
        ``Utils.detect_encoding`` and the output is written with that same
        encoding.

        Arguments:
            srt_file_path {string} -- The path to the SubRip file.
            sami_file_path {string} -- The path to the SAMI file. When omitted,
                the SubRip path with its final extension replaced by ".smi".
        """
        import os

        file: Union[TextIO, BinaryIO]
        converter = CaptionConverter()
        encoding = Utils.detect_encoding(srt_file_path)
        with open(srt_file_path, "r", encoding=encoding) as file:
            converter.read(file.read(), SRTReader())
        if sami_file_path is None:
            # Swap only the final extension: a plain str.replace would also
            # rewrite ".srt" inside directory names, and would leave the path
            # unchanged (overwriting the input) for names without ".srt".
            sami_file_path = os.path.splitext(srt_file_path)[0] + ".smi"
        with open(sami_file_path, "wb") as file:
            file.write(converter.write(SAMIWriter()).encode(encoding))
Exemple #8
0
def main(argv):
    """Convert a Technoleads caption file to SCC and print it to stdout.

    Recognised options:
        -h                 print the usage line and exit.
        -i, --ifile FILE   the input file, read as ISO-8859-1.
        -m                 accepted by the parser but has no effect here.

    Arguments:
        argv -- the command-line arguments, without the program name.

    Exits with status 2 (after printing the usage line) when the options
    cannot be parsed or no input file was given.
    """
    usage = 'script2scc.py -i <inputfile>'
    inputfile = ''
    try:
        opts, _args = getopt.getopt(argv, "hi:m", ["ifile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg

    if not inputfile:
        # Without this guard the script would fail inside open('') with an
        # unhelpful FileNotFoundError.
        print(usage)
        sys.exit(2)

    # The context manager guarantees the handle is closed even if read fails.
    with open(inputfile, 'r', encoding='iso-8859-1') as file:
        technoleads_content = file.read()

    converter = CaptionConverter()
    converter.read(technoleads_content, TechnoleadsReader(lang='fr'))
    # print (converter.write(WebVTTWriter()))
    print(converter.write(SCCWriter()))
Exemple #9
0
def subtitle(request, title, no):
    """Serve one English subtitle of a film as WebVTT.

    Parenthesised parts and the last character are removed from ``title``
    before it is used as the search term. The subtitle at index ``int(no)``
    of the search result is downloaded as a zip archive, and the first
    archive member is decoded, parsed as SRT and converted to WebVTT.

    Returns:
        An ``HttpResponse`` with content type ``text/vtt`` whose body is the
        ASCII-only, HTML-unescaped WebVTT text.
    """
    search_term = re.sub(r'\(.*?\)', '', title)[:-1]
    film = subscene.search(search_term, "English")

    chosen = film.subtitles[int(no)]
    download = requests.get(subscene.zipped_url(chosen))

    archive = zipfile.ZipFile(StringIO(download.content), 'r')
    first_member = archive.namelist()[0]
    raw_srt = archive.read(first_member)

    # BeautifulSoup is only used here to obtain the detected encoding.
    soup = BeautifulSoup(raw_srt)
    # print(soup.originalEncoding)
    converter = CaptionConverter()
    text = unicode(raw_srt.decode(soup.originalEncoding))
    if "utf-8" in soup.originalEncoding:
        # Presumably skips a leading byte-order mark -- TODO confirm.
        text = text[1:]
    converter.read(text, SRTReader())

    unescaper = HTMLParser.HTMLParser()
    vtt_ascii = converter.write(WebVTTWriter()).encode('ascii', 'ignore')
    body = unescaper.unescape(vtt_ascii)

    return HttpResponse(body, content_type="text/vtt")
Exemple #10
0
def convert_subtitles_to_vtt(input_file: str, output_file: str):
    """Convert an .srt subtitle file to a .vtt file for web playback.

    The input encoding is guessed with ``chardet`` from the raw bytes, then
    the file is re-read as text using that encoding. The WebVTT output is
    written as ``utf-8-sig``.

    Returns:
        True when the output file was written, False when the SRT reader
        found no captions (the failure is logged).
    """
    logger.info(f'Converting {input_file} to {output_file}')

    with open(input_file, mode='rb') as binary_source:
        detection = chardet.detect(binary_source.read())
    source_encoding = detection['encoding']

    with open(input_file, mode='r', encoding=source_encoding) as text_source:
        srt_text = text_source.read()

    srt_to_vtt = CaptionConverter()
    try:
        srt_to_vtt.read(srt_text, SRTReader())
    except CaptionReadNoCaptions:
        logger.exception(f'Failed to convert {input_file} to {output_file}')
        return False  # Likely UTF-16 subtitles

    # Render before opening the output so a failed conversion cannot leave a
    # truncated file behind.
    vtt_text = srt_to_vtt.write(WebVTTWriter())
    with open(output_file, mode='w', encoding='utf-8-sig') as vtt_file:
        vtt_file.write(vtt_text)

    return True
Exemple #11
0
def from_srt(input_f, output_f):
  """
    Takes an input SRT file or filename and writes out VTT contents to the given
    output file or filename.

    The input bytes are decoded with the encoding guessed by chardet; when the
    guess is missing or its confidence is below 0.9, cp1252 is used instead.
    If decoding fails, cp1252 and then utf8 are tried before the
    UnicodeDecodeError is re-raised.
  """
  default_subrip_encoding = 'cp1252' # standard for SubRip files

  with vtt_open(input_f, 'r') as f:
    orig = f.read()

  detect = chardet.detect(orig)
  encoding = detect['encoding']
  confidence = detect['confidence']

  # chardet reports encoding=None when it cannot guess at all.
  if encoding is None or confidence < 0.9:
    encoding = default_subrip_encoding

  backups = [default_subrip_encoding, 'utf8']

  while True:
    try:
      # Parenthesised single argument: prints the same on Python 2 and 3.
      print("ENCODING: " + encoding)
      contents = orig.decode(encoding)
      break
    except UnicodeDecodeError:
      if not backups:
        raise
      encoding = backups.pop(0)

  # caption converter seems to have a tough time with the BOM on
  # Python < 2.7.8, so ditch it if it exists. After decoding, the BOM is the
  # single character U+FEFF, not the three bytes of codecs.BOM_UTF8.
  if contents.startswith(u'\ufeff'):
    contents = contents[1:]

  converter = CaptionConverter()
  converter.read(contents, SRTReader())
  contents = converter.write(WebVTTWriter())

  with vtt_open(output_f, 'w') as o:
    # The final byte of the encoded output is dropped (presumably a trailing
    # newline -- TODO confirm).
    o.write(contents.encode('utf-8')[:-1])
Exemple #12
0
from pycaption import SCCReader
from pycaption import WebVTTWriter
from pycaption import SCCWriter
from pycaption import CaptionConverter
from pycaption import TechnoleadsReader

from pprint import pprint

# file=open('journal.scc','r')
# scc_content = data=file.read()

# pycaps = SCCReader().read(scc_content, lang='fr')

# converter = CaptionConverter()
# converter.read(scc_content, SCCReader())
# print (converter.write(SCCWriter()))
# print (converter.write(WebVTTWriter()))

# pprint(pycaps)

# Read a Technoleads caption file (ISO-8859-1, French) and print it as SCC.
# The context manager closes the handle as soon as the content is read.
with open('conseiller_Le_GA00120742_MF0HP.txt', 'r',
          encoding='iso-8859-1') as file:
    technoleads_content = data = file.read()
converter = CaptionConverter()
converter.read(technoleads_content, TechnoleadsReader(lang='fr'))
# print (converter.write(WebVTTWriter()))
print(converter.write(SCCWriter()))