Exemple #1
0
def _srt_gen_from_url(base_url, end_time=3660, verbose=True):
    """Yield re-timed SRT text for consecutive time windows of ``base_url``.

    Captions are requested window by window through the ``t`` query
    parameter (``"<t0>/<t1>"``). The upper bound advances in steps of 60 and
    fetching stops once the next upper bound would exceed ``end_time``; the
    first window is always fetched. Each non-empty response is parsed as
    SRT, its caption times are shifted by the end time of the last caption
    produced so far, and the result is serialised as SRT again.

    Args:
        base_url: URL queried with ``requests.get``.
        end_time: Largest window upper bound to request.
        verbose: When true, print the window that is being fetched.

    Yields:
        The SRT text of each window with a space inserted before every blank
        line, or ``''`` for a window whose response body was empty.

    Raises:
        requests.HTTPError: If a request comes back with an error status.
    """
    dt = 60
    t0 = 0
    t1 = t0 + dt

    has_next = True
    first = True
    last_end = 0.0

    while has_next:
        window = '{}/{}'.format(t0, t1)

        if verbose:
            print('fetching captions from ' + base_url + '?t=' + window)

        res = requests.get(base_url, params={'t': window})
        res.raise_for_status()

        srt = res.text
        if first:
            # Only the very first response is stripped of byte-order marks.
            srt = srt.replace(u'\ufeff', '')
            first = False

        t0 = t1 + 1
        t1 = t1 + dt
        has_next = t1 <= end_time

        if not srt:
            yield ''
            continue

        cc = CaptionConverter()
        cc.read(srt, SRTReader())
        captions = cc.captions.get_captions(lang='en-US')

        # No special case is needed for the first window: last_end starts at
        # 0.0, so shifting leaves its times unchanged.
        for caption in captions:
            caption.start += last_end
            caption.end += last_end

        # Keep the previous offset when a window parsed to no captions
        # instead of failing on captions[-1].
        if captions:
            last_end = captions[-1].end

        yield cc.write(SRTWriter()).replace('\n\n', ' \n\n')
Exemple #2
0
def toSrt(file_name):
    """Convert ``<file_name>.xml`` (read as DFXP) into ``<file_name>.srt``.

    Every ``&#x266A;`` entity is removed from the input text before parsing.

    Args:
        file_name: Path without extension; ``.xml`` is appended for the
            input and ``.srt`` for the output.
    """
    # Context managers close both handles even when reading or parsing fails.
    with open(file_name + '.xml') as transcript:
        new_transcript = transcript.read().replace('&#x266A;', '')

    converter = CaptionConverter()
    converter.read(unicode(new_transcript), DFXPReader())
    string = converter.write(SRTWriter())

    # The output is opened only after the conversion succeeded, so a failed
    # conversion no longer leaves a truncated .srt file behind.
    with open(file_name + '.srt', 'w') as f:
        f.write(string)
Exemple #3
0
def _srt_gen_from_url(base_url, end_time=3660, verbose=True):
    """Yield re-timed SRT text for consecutive time windows of ``base_url``.

    Captions are requested window by window through the ``t`` query
    parameter (``"<t0>/<t1>"``). The upper bound advances in steps of 60 and
    fetching stops once the next upper bound would exceed ``end_time``; the
    first window is always fetched. Each non-empty response is parsed as
    SRT, its caption times are shifted by the end time of the last caption
    produced so far, and the result is serialised as SRT again.

    Args:
        base_url: URL queried with ``requests.get``.
        end_time: Largest window upper bound to request.
        verbose: When true, print the window that is being fetched.

    Yields:
        The SRT text of each window with a space inserted before every blank
        line, or ``''`` for a window whose response body was empty.

    Raises:
        requests.HTTPError: If a request comes back with an error status.
    """
    dt = 60
    t0 = 0
    t1 = t0 + dt

    has_next = True
    first = True
    last_end = 0.0

    while has_next:
        window = '{}/{}'.format(t0, t1)

        if verbose:
            print('fetching captions from ' + base_url + '?t=' + window)

        res = requests.get(base_url, params={'t': window})
        res.raise_for_status()

        srt = res.text
        if first:
            # Only the very first response is stripped of byte-order marks.
            srt = srt.replace(u'\ufeff', '')
            first = False

        t0 = t1 + 1
        t1 = t1 + dt
        has_next = t1 <= end_time

        if not srt:
            yield ''
            continue

        cc = CaptionConverter()
        cc.read(srt, SRTReader())
        captions = cc.captions.get_captions(lang='en-US')

        # No special case is needed for the first window: last_end starts at
        # 0.0, so shifting leaves its times unchanged.
        for caption in captions:
            caption.start += last_end
            caption.end += last_end

        # Keep the previous offset when a window parsed to no captions
        # instead of failing on captions[-1].
        if captions:
            last_end = captions[-1].end

        yield cc.write(SRTWriter()).replace('\n\n', ' \n\n')
Exemple #4
0
def route_subtitles(course_id, lecture_id):
    """Serve the subtitles of a lecture as WebVTT.

    The subtitles are downloaded from class.coursera.org for the given
    course and lecture, parsed as SRT and re-written as WebVTT. When the
    download holds no captions the response body is empty.

    Args:
        course_id: Course identifier interpolated into the URL.
        lecture_id: Integer lecture identifier interpolated into the URL.

    Returns:
        A ``Response`` with content type ``text/vtt``.
    """
    url_template = 'https://class.coursera.org/%s-001/lecture/subtitles?q=%d_en'
    download = requests.get(url_template % (course_id, lecture_id))

    try:
        caption_converter = CaptionConverter()
        caption_converter.read(download.text, SRTReader())
        body = caption_converter.write(WebVTTWriter())
    except CaptionReadNoCaptions:
        # Nothing to convert: answer with an empty document.
        body = ''

    return Response(body, content_type='text/vtt')
Exemple #5
0
    def srt2ttml(srt_file_path, ttml_file_path=None):
        """Convert SubRip subtitles to TTML subtitles.

        Arguments:
            srt_file_path {string} -- The path to the SubRip file.
            ttml_file_path {string} -- The path to the TTML file. When
                omitted it is derived from srt_file_path by swapping a
                trailing ".srt" for ".xml" (".xml" is appended when the
                path has no such suffix).
        """

        converter = CaptionConverter()
        with open(srt_file_path, "r", encoding="utf8") as file:
            converter.read(file.read(), SRTReader())
        if ttml_file_path is None:
            # Rewrite only the extension: str.replace would also change
            # ".srt" inside directory names, and would hand back the input
            # path itself (overwriting the source) when there is no ".srt".
            if srt_file_path.endswith(".srt"):
                stem = srt_file_path[:-len(".srt")]
            else:
                stem = srt_file_path
            ttml_file_path = stem + ".xml"
        with open(ttml_file_path, "wb") as file:
            file.write(converter.write(DFXPWriter()).encode("utf-8"))
Exemple #6
0
    def ttml2srt(ttml_file_path, srt_file_path=None):
        """Convert TTML subtitles to SubRip subtitles.

        Arguments:
            ttml_file_path {string} -- The path to the TTML file.
            srt_file_path {string} -- The path to the SubRip file. When
                omitted it is derived from ttml_file_path by swapping a
                trailing ".xml" for ".srt" (".srt" is appended when the
                path has no such suffix).
        """

        converter = CaptionConverter()
        with open(ttml_file_path, "r", encoding="utf8") as file:
            converter.read(file.read(), DFXPReader())
        if srt_file_path is None:
            # Rewrite only the extension: str.replace would also change
            # ".xml" inside directory names, and would hand back the input
            # path itself (overwriting the source) when there is no ".xml".
            if ttml_file_path.endswith(".xml"):
                stem = ttml_file_path[:-len(".xml")]
            else:
                stem = ttml_file_path
            srt_file_path = stem + ".srt"
        with open(srt_file_path, "wb") as file:
            file.write(converter.write(SRTWriter()).encode("utf-8"))
Exemple #7
0
def _make_ts_from_srt(srt):
    """Turn SRT text into a list of transcript pieces split on ``>>``.

    The text gets a space at its end anchor and before every blank line, is
    NFC-normalised, and loses every character whose Unicode category starts
    with ``C`` except newlines. It is then parsed as SRT and written with
    ``TranscriptWriter``; ``'>>> '`` is collapsed to ``'>>'`` and newlines
    become spaces before the final split.

    Args:
        srt: SRT document as text.

    Returns:
        The list produced by splitting the transcript on ``'>>'``.
    """
    padded = re.sub('$', ' ', srt)
    padded = padded.replace('\n\n', ' \n\n')
    normalized = unicodedata.normalize('NFC', padded)

    # Keep newlines, drop every other character of a "C*" category.
    cleaned = ''.join([
        ch for ch in normalized
        if ch == '\n' or unicodedata.category(ch)[0] != 'C'
    ])

    converter = CaptionConverter()
    converter.read(cleaned, SRTReader())

    transcript = converter.write(TranscriptWriter())
    transcript = transcript.replace(u'>>> ', u'>>')
    transcript = transcript.replace('\n', ' ')

    return transcript.split('>>')
Exemple #8
0
def _make_ts_from_srt(srt):
    """Turn SRT text into a list of transcript pieces split on ``>>``.

    The text gets a space at its end anchor and before every blank line, is
    NFC-normalised, and loses every character whose Unicode category starts
    with ``C`` except newlines. It is then parsed as SRT and written with
    ``TranscriptWriter``; ``'>>> '`` is collapsed to ``'>>'`` and newlines
    become spaces before the final split.

    Args:
        srt: SRT document as text.

    Returns:
        The list produced by splitting the transcript on ``'>>'``.
    """
    padded = re.sub('$', ' ', srt)
    padded = padded.replace('\n\n', ' \n\n')
    normalized = unicodedata.normalize('NFC', padded)

    # Keep newlines, drop every other character of a "C*" category.
    cleaned = ''.join([
        ch for ch in normalized
        if ch == '\n' or unicodedata.category(ch)[0] != 'C'
    ])

    converter = CaptionConverter()
    converter.read(cleaned, SRTReader())

    transcript = converter.write(TranscriptWriter())
    transcript = transcript.replace(u'>>> ', u'>>')
    transcript = transcript.replace('\n', ' ')

    return transcript.split('>>')
Exemple #9
0
def dfxpToSrt(myFile, keep, verbose):
    """Convert the DFXP caption file ``myFile`` to SRT in place.

    The input is first renamed to ``<myFile>.dfxp``; the SRT output is then
    written to the original ``myFile`` path.

    Args:
        myFile: Path of the DFXP file; it holds the SRT output afterwards.
        keep: When false, the renamed ``.dfxp`` copy is deleted at the end.
        verbose: When true, also print the rename and delete steps.
    """
    dfxp_path = "%s.dfxp" % myFile

    if verbose:
        print("--- Renaming to %s.dfxp" % myFile)
    os.rename(myFile, dfxp_path)

    # This message is printed regardless of `verbose`.
    print("--- Converting to srt")

    # The source handle is closed as soon as it is read, even on errors.
    with open(dfxp_path) as sourceFile:
        caps = sourceFile.read()

    converter = CaptionConverter()
    converter.read(caps, DFXPReader())  # parse the DFXP source

    with open(myFile, "w") as targetFile:
        targetFile.write(converter.write(SRTWriter()))

    if not keep:
        if verbose:
            print("--- Deleting temporary file %s.dfxp" % myFile)
        os.remove(dfxp_path)
Exemple #10
0
def webvttToSrt(myFile, keep, verbose):
    """Convert the WebVTT caption file ``myFile`` to SRT in place.

    The input is first renamed to ``<myFile>.webvtt``; the SRT output is
    then written to the original ``myFile`` path. Both files are handled as
    UTF-8 text.

    Args:
        myFile: Path of the WebVTT file; it holds the SRT output afterwards.
        keep: When false, the renamed ``.webvtt`` copy is deleted at the end.
        verbose: When true, also print the rename and delete steps.
    """
    webvtt_path = "%s.webvtt" % myFile

    if verbose:
        print("--- Renaming to %s.webvtt" % myFile)
    os.rename(myFile, webvtt_path)

    # This message is printed regardless of `verbose`.
    print("--- Converting to srt")

    # The source handle is closed as soon as it is read, even on errors.
    with codecs.open(webvtt_path, "r", encoding="utf8") as sourceFile:
        caps = sourceFile.read()

    converter = CaptionConverter()
    converter.read(caps, WebVTTReader())  # parse the WebVTT source

    with codecs.open(myFile, "w", encoding="utf8") as targetFile:
        targetFile.write(converter.write(SRTWriter()))

    if not keep:
        if verbose:
            print("--- Deleting temporary file %s.webvtt" % myFile)
        os.remove(webvtt_path)
Exemple #11
0
    def ttml2srt(ttml_file_path: str, srt_file_path: Optional[str] = None) -> None:
        """Convert TTML subtitles to SubRip subtitles.

        The output is written with the encoding detected for the input file.

        Arguments:
            ttml_file_path {string} -- The path to the TTML file.
            srt_file_path {string} -- The path to the SubRip file. When
                omitted it is derived from ttml_file_path by swapping a
                trailing ".xml" for ".srt" (".srt" is appended when the
                path has no such suffix).
        """

        file: Union[TextIO, BinaryIO]
        converter = CaptionConverter()
        encoding = Utils.detect_encoding(ttml_file_path)
        with open(ttml_file_path, "r", encoding=encoding) as file:
            converter.read(file.read(), DFXPReader())
        if srt_file_path is None:
            # Rewrite only the extension: str.replace would also change
            # ".xml" inside directory names, and would hand back the input
            # path itself (overwriting the source) when there is no ".xml".
            if ttml_file_path.endswith(".xml"):
                stem = ttml_file_path[:-len(".xml")]
            else:
                stem = ttml_file_path
            srt_file_path = stem + ".srt"
        with open(srt_file_path, "wb") as file:
            file.write(converter.write(SRTWriter()).encode(encoding))
Exemple #12
0
    def srt2sami(srt_file_path: str, sami_file_path: Optional[str] = None) -> None:
        """Convert SubRip subtitles to SAMI subtitles.

        The output is written with the encoding detected for the input file.

        Arguments:
            srt_file_path {string} -- The path to the SubRip file.
            sami_file_path {string} -- The path to the SAMI file. When
                omitted it is derived from srt_file_path by swapping a
                trailing ".srt" for ".smi" (".smi" is appended when the
                path has no such suffix).
        """

        file: Union[TextIO, BinaryIO]
        converter = CaptionConverter()
        encoding = Utils.detect_encoding(srt_file_path)
        with open(srt_file_path, "r", encoding=encoding) as file:
            converter.read(file.read(), SRTReader())
        if sami_file_path is None:
            # Rewrite only the extension: str.replace would also change
            # ".srt" inside directory names, and would hand back the input
            # path itself (overwriting the source) when there is no ".srt".
            if srt_file_path.endswith(".srt"):
                stem = srt_file_path[:-len(".srt")]
            else:
                stem = srt_file_path
            sami_file_path = stem + ".smi"
        with open(sami_file_path, "wb") as file:
            file.write(converter.write(SAMIWriter()).encode(encoding))
Exemple #13
0
def main(argv):
    """Read a Technoleads transcript and print it converted to SCC.

    Args:
        argv: Command-line arguments without the program name. ``-i FILE``
            or ``--ifile=FILE`` selects the input file, ``-h`` prints the
            usage line.

    Raises:
        SystemExit: After ``-h`` (no exit code), or with code 2 when the
            options cannot be parsed or no input file was given.
    """
    usage = 'script2scc.py -i <inputfile>'
    inputfile = ''
    try:
        opts, _ = getopt.getopt(argv, "hi:m", ["ifile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg

    # Without an input file open('') would fail with an unhelpful error;
    # treat it as a usage error instead.
    if not inputfile:
        print(usage)
        sys.exit(2)

    # The input is read as ISO-8859-1 text; the handle is closed right away.
    with open(inputfile, 'r', encoding='iso-8859-1') as file:
        technoleads_content = file.read()

    converter = CaptionConverter()
    converter.read(technoleads_content, TechnoleadsReader(lang='fr'))
    print(converter.write(SCCWriter()))
Exemple #14
0
def subtitle(request, title, no):
    """Serve one subscene subtitle of a film as WebVTT.

    Parenthesised parts and the last character are removed from ``title``
    before it is used to search subscene for English subtitles. The
    subtitle at index ``no`` is downloaded as a zip archive, its first
    member is decoded with the encoding BeautifulSoup reports, parsed as SRT
    and written as WebVTT. Non-ASCII characters are dropped and HTML
    entities are unescaped in the response body.

    Args:
        request: Incoming request object (not used by the body).
        title: Film title used for the search.
        no: Index into the film's subtitle list (anything ``int()`` accepts).

    Returns:
        An ``HttpResponse`` with content type ``text/vtt``.
    """
    query = re.sub(r'\(.*?\)', '', title)
    query = query[:-1]
    film = subscene.search(query, "English")

    chosen = film.subtitles[int(no)]
    download = requests.get(subscene.zipped_url(chosen))

    archive = zipfile.ZipFile(StringIO(download.content), 'r')
    first_member = archive.namelist()[0]
    srt = archive.read(first_member)

    # BeautifulSoup is used here only for its encoding detection.
    source_encoding = BeautifulSoup(srt).originalEncoding
    unistring = unicode(srt.decode(source_encoding))
    if "utf-8" in source_encoding:
        # Drops the first character -- presumably a BOM; confirm.
        unistring = unistring[1:]

    converter = CaptionConverter()
    converter.read(unistring, SRTReader())
    ascii_vtt = converter.write(WebVTTWriter()).encode('ascii', 'ignore')

    body = HTMLParser.HTMLParser().unescape(ascii_vtt)
    return HttpResponse(body, content_type="text/vtt")
Exemple #15
0
def convert_subtitles_to_vtt(input_file: str, output_file: str):
    """Convert .srt subtitles to .vtt for web playback.

    The input encoding is detected with chardet from the raw bytes, then the
    file is read again as text with that encoding. The WebVTT result is
    written as ``utf-8-sig``.

    Args:
        input_file: Path of the SRT file to read.
        output_file: Path of the WebVTT file to write.

    Returns:
        True after the output was written, False when the reader found no
        captions in the input (nothing is written in that case).
    """
    logger.info(f'Converting {input_file} to {output_file}')

    with open(input_file, mode='rb') as binary_source:
        detection = chardet.detect(binary_source.read())
    source_encoding = detection['encoding']

    with open(input_file, mode='r', encoding=source_encoding) as text_source:
        srt_text = str(text_source.read())

    caption_converter = CaptionConverter()
    try:
        caption_converter.read(srt_text, SRTReader())
    except CaptionReadNoCaptions:
        logger.exception(f'Failed to convert {input_file} to {output_file}')
        # Likely UTF-16 subtitles
        return False

    # Render before opening the target so a failure leaves no empty file.
    rendered = caption_converter.write(WebVTTWriter())
    with open(output_file, mode='w', encoding='utf-8-sig') as vtt_target:
        vtt_target.write(rendered)

    return True
Exemple #16
0
def from_srt(input_f, output_f):
    """
    Takes an input SRT file or filename and writes out VTT contents to the
    given output file or filename.

    The input is decoded with the encoding chardet detects when its
    confidence is at least 0.9, otherwise with cp1252. If decoding fails,
    cp1252 and then utf8 are tried before the error is re-raised.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the input.
    """
    default_subrip_encoding = 'cp1252'  # standard for SubRip files

    with vtt_open(input_f, 'r') as f:
        orig = f.read()

    detect = chardet.detect(orig)
    encoding = detect['encoding']
    if detect['confidence'] < 0.9:
        encoding = default_subrip_encoding

    backups = [default_subrip_encoding, 'utf8']

    while True:
        try:
            print("ENCODING: " + encoding)
            contents = orig.decode(encoding)
            break
        except UnicodeDecodeError:
            if not backups:
                raise
            encoding = backups.pop(0)

    # caption converter seems to have a tough time with the BOM on
    # Python < 2.7.8, so ditch it if it exists. Once decoded, the BOM is the
    # single character U+FEFF; comparing the text against the three raw
    # bytes of codecs.BOM_UTF8 can never match.
    if contents.startswith(u'\ufeff'):
        contents = contents[1:]

    converter = CaptionConverter()
    converter.read(contents, SRTReader())
    contents = converter.write(WebVTTWriter())

    with vtt_open(output_f, 'w') as o:
        # Drops the final byte of the encoded output -- presumably a
        # trailing newline; confirm.
        o.write(contents.encode('utf-8')[:-1])
def convertText(inputfile):
    """Write the distinct caption text lines of every WebVTT file in a folder.

    Each entry of the directory ``inputfile`` is read as UTF-8, parsed as
    WebVTT and re-written as SRT. SRT lines that contain a timestamp,
    contain any digit, or are exactly ``-->`` are dropped; the remaining
    distinct lines are joined with spaces (in set order, which is arbitrary)
    and written to ``/Users/lavanyasunder1/Subtitles/TXT/<name>.txt``.

    Args:
        inputfile: Path of the directory holding the WebVTT files.
    """
    # Compiled once instead of being looked up again for every line.
    timestamp_re = re.compile('([0-9]){2}:([0-9]){2}:([0-9]){2},([0-9]){3}')
    # NOTE(review): this matches any line containing a digit, not only the
    # cue-number lines, so caption text with digits is dropped -- confirm.
    digit_re = re.compile('([0-9])+')

    for sample_name in os.listdir(inputfile):
        with open(inputfile + "/" + sample_name, 'r') as f:
            read_data = f.read()
        read_data = unicode(read_data.decode("utf8"))

        converter = CaptionConverter()
        converter.read(read_data, WebVTTReader())
        srt_text = converter.write(SRTWriter())

        # Only newline-terminated lines count; text after the last newline
        # is ignored, hence the [:-1].
        kept_lines = [
            line for line in srt_text.split("\n")[:-1]
            if not (timestamp_re.search(line)
                    or digit_re.search(line)
                    or line == '-->')
        ]
        words = set(kept_lines)

        output_path = "/Users/lavanyasunder1/Subtitles/TXT/%s.txt" % (sample_name)
        with open(output_path, 'w+') as output_file:
            output_file.write(" ".join(words).encode("utf-8"))
            output_file.write("\n")
Exemple #18
0
from pycaption import SCCReader
from pycaption import WebVTTWriter
from pycaption import SCCWriter
from pycaption import CaptionConverter
from pycaption import TechnoleadsReader

from pprint import pprint

# file=open('journal.scc','r')
# scc_content = data=file.read()

# pycaps = SCCReader().read(scc_content, lang='fr')

# converter = CaptionConverter()
# converter.read(scc_content, SCCReader())
# print (converter.write(SCCWriter()))
# print (converter.write(WebVTTWriter()))

# pprint(pycaps)

# Read the Technoleads transcript as ISO-8859-1 text, parse it with
# lang='fr' and print the SCC rendering. The context manager closes the
# input handle as soon as the text has been read.
with open('conseiller_Le_GA00120742_MF0HP.txt', 'r', encoding='iso-8859-1') as file:
    technoleads_content = data = file.read()
converter = CaptionConverter()
converter.read(technoleads_content, TechnoleadsReader(lang='fr'))
# print (converter.write(WebVTTWriter()))
print(converter.write(SCCWriter()))