コード例 #1
0
    def convert(content, input_format, output_format):
        """
        Convert transcript `content` from `input_format` to `output_format`.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, txt, sjson.

        Arguments:
            content: Transcript content, either `str` or `bytes`.
            input_format: Format of `content` ('srt' or 'sjson').
            output_format: Requested format ('txt', 'srt' or 'sjson').

        Returns:
            The converted transcript. When both formats are the same,
            `content` is returned untouched (no decoding is performed).

        Raises:
            TranscriptsGenerationException: On parsing the invalid srt content during conversion from srt to sjson.
        """
        assert input_format in ('srt', 'sjson')
        assert output_format in ('txt', 'srt', 'sjson')

        if input_format == output_format:
            return content

        if input_format == 'srt':
            # Standardize content into bytes, then decode exactly once with
            # 'utf-8-sig' so a leading byte order mark (BOM) is skipped for
            # both the txt and the sjson outputs, not only for sjson.
            if isinstance(content, str):
                content = content.encode('utf-8')
            srt_text = content.decode('utf-8-sig')

            if output_format == 'txt':
                return html.unescape(SubRipFile.from_string(srt_text).text)

            # Only 'sjson' is left as a possible output for srt input.
            try:
                srt_subs = SubRipFile.from_string(
                    srt_text,
                    error_handling=SubRipFile.ERROR_RAISE
                )
            except Error as ex:   # Base exception from pysrt
                raise TranscriptsGenerationException(str(ex)) from ex

            return json.dumps(generate_sjson_from_srt(srt_subs))

        # From here on input_format is 'sjson'.
        # If the JSON file content is bytes, try UTF-8, then Latin-1
        if isinstance(content, bytes):
            try:
                content_str = content.decode('utf-8')
            except UnicodeDecodeError:
                content_str = content.decode('latin-1')
        else:
            content_str = content

        try:
            content_dict = json.loads(content_str)
        except ValueError:
            # Unparseable sjson is logged and replaced by a placeholder
            # transcript instead of failing the whole conversion.
            truncated = content_str[:100].strip()
            log.exception(
                f"Failed to convert {input_format} to {output_format} for {repr(truncated)}..."
            )
            content_dict = {"start": [1], "end": [2], "text": ["An error occured obtaining the transcript."]}

        if output_format == 'txt':
            # Falsy entries (e.g. None) become empty lines so the join cannot fail.
            text_without_none = [line or '' for line in content_dict['text']]
            return html.unescape("\n".join(text_without_none))

        # Only 'srt' is left as a possible output for sjson input.
        return generate_srt_from_sjson(content_dict, speed=1.0)
コード例 #2
0
ファイル: test_srtfile.py プロジェクト: GunioRobot/pysrt
 def test_windows1252(self):
     """Parse a windows-1252 SRT string and check item count, EOL and strict decoding."""
     # Read the fixture inside a context manager so the handle is closed
     # deterministically instead of being left to the garbage collector.
     with codecs.open(self.windows_path, encoding='windows-1252') as srt_fd:
         srt_string = srt_fd.read()
     srt_file = SubRipFile.from_string(srt_string, encoding='windows-1252', eol='\r\n')
     self.assertEqual(len(srt_file), 1332)
     self.assertEqual(srt_file.eol, '\r\n')
     # Opening the UTF-8 fixture with the ascii codec is expected to fail.
     self.assertRaises(UnicodeDecodeError, SubRipFile.open,
         self.utf8_path, encoding='ascii')
コード例 #3
0
    def convert(content, input_format, output_format):
        """
        Convert transcript `content` from `input_format` to `output_format`.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, txt.

        Although 'sjson' passes the output-format assertion, converting srt
        to sjson is not implemented here and raises `NotImplementedError`.
        When both formats are the same, `content` is returned untouched.
        """
        assert input_format in ('srt', 'sjson')
        assert output_format in ('txt', 'srt', 'sjson')

        # `HTMLParser.unescape` was removed in Python 3.9; prefer the
        # supported `html.unescape` and only fall back to the parser method
        # on interpreters that do not provide it.
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape

        if input_format == output_format:
            return content

        if input_format == 'srt':

            if output_format == 'txt':
                text = SubRipFile.from_string(content.decode('utf8')).text
                return unescape(text)

            elif output_format == 'sjson':
                raise NotImplementedError

        if input_format == 'sjson':
            sjson = json.loads(content)

            if output_format == 'txt':
                # Falsy entries (e.g. None) become empty lines so the join
                # cannot raise a TypeError.
                text_without_none = [line or '' for line in sjson['text']]
                return unescape("\n".join(text_without_none))

            elif output_format == 'srt':
                return generate_srt_from_sjson(sjson, speed=1.0)
コード例 #4
0
def convert_srt_to_sjson(srt_data):
    """
    Build an SJSON dictionary from SRT content.

    Arguments:
        srt_data: unicode, content of source subs.

    Returns:
        dict: SJSON data with parallel 'start', 'end' and 'text' lists,
        one entry per parsed subtitle.
    """
    sjson = {'start': [], 'end': [], 'text': []}

    for cue in SubRipFile.from_string(srt_data):
        sjson['start'].append(cue.start.ordinal)
        sjson['end'].append(cue.end.ordinal)
        # Multi-line cues are flattened onto a single line.
        sjson['text'].append(cue.text.replace('\n', ' '))

    return sjson
コード例 #5
0
    def convert(content, input_format, output_format):
        """
        Convert transcript `content` from `input_format` to `output_format`.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, txt.

        Although 'sjson' passes the output-format assertion, converting srt
        to sjson is not implemented here and raises `NotImplementedError`.
        When both formats are the same, `content` is returned untouched.
        """
        assert input_format in ('srt', 'sjson')
        assert output_format in ('txt', 'srt', 'sjson')

        # `HTMLParser.unescape` was removed in Python 3.9; prefer the
        # supported `html.unescape` and only fall back to the parser method
        # on interpreters that do not provide it.
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape

        if input_format == output_format:
            return content

        if input_format == 'srt':

            if output_format == 'txt':
                text = SubRipFile.from_string(content.decode('utf8')).text
                return unescape(text)

            elif output_format == 'sjson':
                raise NotImplementedError

        if input_format == 'sjson':
            sjson = json.loads(content)

            if output_format == 'txt':
                # Falsy entries (e.g. None) become empty lines so the join
                # cannot raise a TypeError.
                text_without_none = [line or '' for line in sjson['text']]
                return unescape("\n".join(text_without_none))

            elif output_format == 'srt':
                return generate_srt_from_sjson(sjson, speed=1.0)
コード例 #6
0
    def convert(content, input_format, output_format):
        """
        Convert transcript `content` from `input_format` to `output_format`.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, txt, sjson.

        Arguments:
            content: Transcript content; srt input may be `str` or `bytes`.
            input_format: Format of `content` ('srt' or 'sjson').
            output_format: Requested format ('txt', 'srt' or 'sjson').

        Returns:
            The converted transcript. When both formats are the same,
            `content` is returned untouched.

        Raises:
            TranscriptsGenerationException: On parsing the invalid srt content during conversion from srt to sjson.
        """
        assert input_format in ('srt', 'sjson')
        assert output_format in ('txt', 'srt', 'sjson')

        if input_format == output_format:
            return content

        if input_format == 'srt':
            # Standardize content into bytes, then decode exactly once with
            # 'utf-8-sig' so a leading byte order mark (BOM) is skipped for
            # both the txt and the sjson outputs.
            if isinstance(content, str):
                content = content.encode('utf-8')
            srt_text = content.decode('utf-8-sig')

            if output_format == 'txt':
                return html.unescape(SubRipFile.from_string(srt_text).text)

            # Only 'sjson' is left as a possible output for srt input.
            try:
                srt_subs = SubRipFile.from_string(
                    srt_text,
                    error_handling=SubRipFile.ERROR_RAISE
                )
            except Error as ex:   # Base exception from pysrt
                # Chain the original error so the parser traceback is kept.
                raise TranscriptsGenerationException(str(ex)) from ex

            return json.dumps(generate_sjson_from_srt(srt_subs))

        # From here on input_format is 'sjson'.
        sjson = json.loads(content)

        if output_format == 'txt':
            # Falsy entries (e.g. None) become empty lines so the join cannot fail.
            text_without_none = [line or '' for line in sjson['text']]
            return html.unescape("\n".join(text_without_none))

        # Only 'srt' is left as a possible output for sjson input.
        return generate_srt_from_sjson(sjson, speed=1.0)
コード例 #7
0
    def convert(content, input_format, output_format):
        """
        Convert transcript `content` from `input_format` to `output_format`.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, txt, sjson.

        When both formats are the same, `content` is returned untouched.

        Raises:
            TranscriptsGenerationException: On parsing the invalid srt content during conversion from srt to sjson.
        """
        assert input_format in ('srt', 'sjson')
        assert output_format in ('txt', 'srt', 'sjson')

        # `HTMLParser.unescape` was removed in Python 3.9; prefer the
        # supported `html.unescape` and only fall back to the parser method
        # on interpreters that do not provide it.
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape

        if input_format == output_format:
            return content

        if input_format == 'srt':

            if output_format == 'txt':
                text = SubRipFile.from_string(content.decode('utf8')).text
                return unescape(text)

            elif output_format == 'sjson':
                try:
                    # With error handling (set to 'ERROR_RAISE'), we will be getting
                    # the exception if something went wrong in parsing the transcript.
                    srt_subs = SubRipFile.from_string(
                        # Skip byte order mark(BOM) character
                        content.decode('utf-8-sig'),
                        error_handling=SubRipFile.ERROR_RAISE
                    )
                except Error as ex:   # Base exception from pysrt
                    raise TranscriptsGenerationException(text_type(ex))

                return json.dumps(generate_sjson_from_srt(srt_subs))

        if input_format == 'sjson':
            sjson = json.loads(content)

            if output_format == 'txt':
                # Falsy entries (e.g. None) become empty lines so the join cannot fail.
                text_without_none = [line or '' for line in sjson['text']]
                return unescape("\n".join(text_without_none))

            elif output_format == 'srt':
                return generate_srt_from_sjson(sjson, speed=1.0)
コード例 #8
0
    def convert(content, input_format, output_format):
        """
        Convert transcript `content` from `input_format` to `output_format`.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, txt, sjson.

        When both formats are the same, `content` is returned untouched.

        Raises:
            TranscriptsGenerationException: On parsing the invalid srt content during conversion from srt to sjson.
        """
        assert input_format in ('srt', 'sjson')
        assert output_format in ('txt', 'srt', 'sjson')

        # `HTMLParser.unescape` was removed in Python 3.9; prefer the
        # supported `html.unescape` and only fall back to the parser method
        # on interpreters that do not provide it.
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape

        if input_format == output_format:
            return content

        if input_format == 'srt':

            if output_format == 'txt':
                text = SubRipFile.from_string(content.decode('utf8')).text
                return unescape(text)

            elif output_format == 'sjson':
                try:
                    # With error handling (set to 'ERROR_RAISE'), we will be getting
                    # the exception if something went wrong in parsing the transcript.
                    srt_subs = SubRipFile.from_string(
                        # Skip byte order mark(BOM) character
                        content.decode('utf-8-sig'),
                        error_handling=SubRipFile.ERROR_RAISE
                    )
                except Error as ex:   # Base exception from pysrt
                    raise TranscriptsGenerationException(text_type(ex))

                return json.dumps(generate_sjson_from_srt(srt_subs))

        if input_format == 'sjson':
            sjson = json.loads(content)

            if output_format == 'txt':
                # Falsy entries (e.g. None) become empty lines so the join cannot fail.
                text_without_none = [line or '' for line in sjson['text']]
                return unescape("\n".join(text_without_none))

            elif output_format == 'srt':
                return generate_srt_from_sjson(sjson, speed=1.0)
コード例 #9
0
ファイル: test_srtfile.py プロジェクト: ichernev/pysrt
 def test_windows1252(self):
     """Parse a windows-1252 SRT string and check item count, EOL and strict decoding."""
     # Read the fixture inside a context manager so the handle is closed
     # deterministically instead of being left to the garbage collector.
     with codecs.open(self.windows_path,
                      encoding='windows-1252') as srt_fd:
         srt_string = srt_fd.read()
     srt_file = SubRipFile.from_string(srt_string,
                                       encoding='windows-1252',
                                       eol='\r\n')
     self.assertEqual(len(srt_file), 1332)
     self.assertEqual(srt_file.eol, '\r\n')
     # Opening the UTF-8 fixture with the ascii codec is expected to fail.
     self.assertRaises(UnicodeDecodeError,
                       SubRipFile.open,
                       self.utf8_path,
                       encoding='ascii')
コード例 #10
0
  def GetSrtCaptions(self):
    """Fetch the ASR captions track as SRT and parse it with pysrt.

    Issues a GET for ``<track_url>?fmt=srt`` through ``self.http`` using
    ``self.headers``. On a "200" status the parsed captions are stored on
    ``self.srt_captions``.

    Raises:
      Error: The response status was anything other than "200".
    """
    srt_url = "%s?fmt=srt" % self.track_url
    response_headers, body = self.http.request(srt_url, "GET", headers=self.headers)

    status = response_headers["status"]
    if status != "200":
      raise Error("Received HTTP response %s when requesting %s?fmt=srt." % (status, self.track_url))

    self.srt_captions = SubRipFile.from_string(body)
コード例 #11
0
def generate_subs_from_source(speed_subs, subs_type, subs_filedata, item, language='en'):
    """Build transcripts from a source subtitle file and store them for `item`.

    The source subtitles are expected to have a speed equal to 1.

    :param speed_subs: dictionary {speed: sub_id, ...}
    :param subs_type: type of source subs; only "srt" (case-insensitive) is accepted.
    :param subs_filedata: unicode, content of source subs.
    :param item: module object.
    :param language: str, language of translation of transcripts
    :returns: dict with 'start', 'end' and 'text' lists built from the parsed subs.
    :raises TranscriptsGenerationException: for a non-srt `subs_type`, a
        parsing failure, or an empty parse result.
    """
    _ = item.runtime.service(item, "i18n").ugettext

    if subs_type.lower() != 'srt':
        raise TranscriptsGenerationException(_("We support only SubRip (*.srt) transcripts format."))

    try:
        parsed_subs = SubRipFile.from_string(subs_filedata)
    except Exception as ex:
        # Any parser failure is reported as a transcripts generation error.
        template = _("Something wrong with SubRip transcripts file during parsing. Inner message is {error_message}")
        raise TranscriptsGenerationException(template.format(error_message=ex.message))

    if not parsed_subs:
        raise TranscriptsGenerationException(_("Something wrong with SubRip transcripts file during parsing."))

    subs = {'start': [], 'end': [], 'text': []}
    for cue in parsed_subs:
        subs['start'].append(cue.start.ordinal)
        subs['end'].append(cue.end.ordinal)
        # Multi-line cues are flattened onto a single line.
        subs['text'].append(cue.text.replace('\n', ' '))

    # Store one generated transcript per requested speed.
    for speed, subs_id in speed_subs.iteritems():
        speed_specific_subs = generate_subs(speed, 1, subs)
        save_subs_to_store(speed_specific_subs, subs_id, item, language)

    return subs
コード例 #12
0
def generate_subs_from_source(speed_subs, subs_type, subs_filedata, item, language='en'):
    """Build transcripts from a source subtitle file and store them for `item`.

    The source subtitles are expected to have a speed equal to 1.

    :param speed_subs: dictionary {speed: sub_id, ...}
    :param subs_type: type of source subs; only "srt" (case-insensitive) is accepted.
    :param subs_filedata: unicode, content of source subs.
    :param item: module object.
    :param language: str, language of translation of transcripts
    :returns: dict with 'start', 'end' and 'text' lists built from the parsed subs.
    :raises TranscriptsGenerationException: for a non-srt `subs_type`, a
        parsing failure, or an empty parse result.
    """
    _ = item.runtime.service(item, "i18n").ugettext

    if subs_type.lower() != 'srt':
        raise TranscriptsGenerationException(_("We support only SubRip (*.srt) transcripts format."))

    try:
        parsed_subs = SubRipFile.from_string(subs_filedata)
    except Exception as ex:
        # Any parser failure is reported as a transcripts generation error.
        template = _("Something wrong with SubRip transcripts file during parsing. Inner message is {error_message}")
        raise TranscriptsGenerationException(template.format(error_message=text_type(ex)))

    if not parsed_subs:
        raise TranscriptsGenerationException(_("Something wrong with SubRip transcripts file during parsing."))

    subs = {'start': [], 'end': [], 'text': []}
    for cue in parsed_subs:
        subs['start'].append(cue.start.ordinal)
        subs['end'].append(cue.end.ordinal)
        # Multi-line cues are flattened onto a single line.
        subs['text'].append(cue.text.replace('\n', ' '))

    # Store one generated transcript per requested speed.
    for speed, subs_id in six.iteritems(speed_subs):
        speed_specific_subs = generate_subs(speed, 1, subs)
        save_subs_to_store(speed_specific_subs, subs_id, item, language)

    return subs
コード例 #13
0
ファイル: utils.py プロジェクト: edx/edx-val
def get_transcript_format(transcript_content):
    """
    Detect the format of a transcript.

    Content that parses as JSON is reported as SJSON. Otherwise the content
    is parsed as SRT and reported as SRT when at least one subtitle is
    found; in every remaining case SJSON is reported.

    Arguments:
        transcript_content (str): Transcript file content.
    """
    try:
        json.loads(transcript_content)
    except ValueError:
        # Not JSON. ERROR_RAISE makes the SRT parser raise when something
        # goes wrong in parsing instead of skipping the broken entry.
        parsed_srt = SubRipFile.from_string(transcript_content, error_handling=SubRipFile.ERROR_RAISE)
        is_srt = len(parsed_srt) > 0
    else:
        is_srt = False

    return TranscriptFormat.SRT if is_srt else TranscriptFormat.SJSON
コード例 #14
0
ファイル: utils.py プロジェクト: lxp20201/lxp
def get_transcript_format(transcript_content):
    """
    Detect the format of a transcript.

    Content that parses as JSON is reported as SJSON. Otherwise the content
    is parsed as SRT and reported as SRT when at least one subtitle is
    found; in every remaining case SJSON is reported.

    Arguments:
        transcript_content (str): Transcript file content.
    """
    try:
        json.loads(transcript_content)
    except ValueError:
        # Not JSON. ERROR_RAISE makes the SRT parser raise when something
        # goes wrong in parsing instead of skipping the broken entry.
        parsed_srt = SubRipFile.from_string(transcript_content, error_handling=SubRipFile.ERROR_RAISE)
        is_srt = len(parsed_srt) > 0
    else:
        is_srt = False

    return TranscriptFormat.SRT if is_srt else TranscriptFormat.SJSON
コード例 #15
0
def generate_subs_from_source(speed_subs, subs_type, subs_filedata, item):
    """Build transcripts from a source subtitle file and store them for `item`.

    The source subtitles are expected to have a speed equal to 1.

    :param speed_subs: dictionary {speed: sub_id, ...}
    :param subs_type: type of source subs; only the exact value "srt" is accepted.
    :param subs_filedata: unicode, content of source subs.
    :param item: module object.
    :returns: dict with 'start', 'end' and 'text' lists built from the parsed subs.
    :raises TranscriptsGenerationException: for a non-srt `subs_type`, a
        parsing failure, or an empty parse result.
    """
    if subs_type != 'srt':
        raise TranscriptsGenerationException("We support only SubRip (*.srt) transcripts format.")

    try:
        parsed_subs = SubRipFile.from_string(subs_filedata)
    except Exception as e:
        # Any parser failure is reported as a transcripts generation error.
        template = "Something wrong with SubRip transcripts file during parsing. Inner message is {}"
        raise TranscriptsGenerationException(template.format(e.message))

    if not parsed_subs:
        raise TranscriptsGenerationException("Something wrong with SubRip transcripts file during parsing.")

    subs = {'start': [], 'end': [], 'text': []}
    for cue in parsed_subs:
        subs['start'].append(cue.start.ordinal)
        subs['end'].append(cue.end.ordinal)
        # Multi-line cues are flattened onto a single line.
        subs['text'].append(cue.text.replace('\n', ' '))

    # Store one generated transcript per requested speed.
    for speed, subs_id in speed_subs.iteritems():
        speed_specific_subs = generate_subs(speed, 1, subs)
        save_subs_to_store(speed_specific_subs, subs_id, item)

    return subs
コード例 #16
0
def generate_subs_from_source(speed_subs, subs_type, subs_filedata, item):
    """Build transcripts from a source subtitle file and store them for `item`.

    The source subtitles are expected to have a speed equal to 1.

    :param speed_subs: dictionary {speed: sub_id, ...}
    :param subs_type: type of source subs; only the exact value "srt" is accepted.
    :param subs_filedata: unicode, content of source subs.
    :param item: module object.
    :returns: dict with 'start', 'end' and 'text' lists built from the parsed subs.
    :raises TranscriptsGenerationException: for a non-srt `subs_type`, a
        parsing failure, or an empty parse result.
    """
    if subs_type != 'srt':
        raise TranscriptsGenerationException("We support only SubRip (*.srt) transcripts format.")

    try:
        parsed_subs = SubRipFile.from_string(subs_filedata)
    except Exception as e:
        # Any parser failure is reported as a transcripts generation error.
        template = "Something wrong with SubRip transcripts file during parsing. Inner message is {}"
        raise TranscriptsGenerationException(template.format(e.message))

    if not parsed_subs:
        raise TranscriptsGenerationException("Something wrong with SubRip transcripts file during parsing.")

    subs = {'start': [], 'end': [], 'text': []}
    for cue in parsed_subs:
        subs['start'].append(cue.start.ordinal)
        subs['end'].append(cue.end.ordinal)
        # Multi-line cues are flattened onto a single line.
        subs['text'].append(cue.text.replace('\n', ' '))

    # Store one generated transcript per requested speed.
    for speed, subs_id in speed_subs.iteritems():
        speed_specific_subs = generate_subs(speed, 1, subs)
        save_subs_to_store(speed_specific_subs, subs_id, item)

    return subs
コード例 #17
0
    def convert(cls, content, input_format, output_format):
        """
        Translate transcript `content` between the srt and sjson formats.

        Arguments:
            content: Transcript content byte-stream.
            input_format: Input transcript format.
            output_format: Output transcript format.

        Accepted input formats: sjson, srt.
        Accepted output format: srt, sjson.

        Returns:
            The converted transcript. When both formats are the same, the
            decoded (BOM-stripped) content is returned as is.

        Raises:
            TranscriptsGenerationException: On parsing the invalid srt
            content during conversion from srt to sjson.
        """
        assert input_format in ('srt', 'sjson')
        assert output_format in ('srt', 'sjson')

        # 'utf-8-sig' also skips a leading byte order mark (BOM) if present.
        decoded = content.decode('utf-8-sig')

        if input_format == output_format:
            return decoded

        if input_format == 'sjson':
            # The formats differ, so the only possible target is srt.
            return cls.generate_srt_from_sjson(json.loads(decoded))

        # Remaining case: srt -> sjson. ERROR_RAISE makes the parser raise
        # when something goes wrong in parsing the transcript.
        try:
            parsed_srt = SubRipFile.from_string(decoded, error_handling=SubRipFile.ERROR_RAISE)
        except Error as ex:  # Base exception from pysrt
            raise TranscriptsGenerationException(text_type(ex))

        return json.dumps(cls.generate_sjson_from_srt(parsed_srt))
コード例 #18
0
    def GetSrtCaptions(self):
        """Fetch the ASR captions track as SRT and parse it with pysrt.

        Issues a GET for ``<track_url>?fmt=srt`` through ``self.http`` using
        ``self.headers``. On a "200" status the parsed captions are stored on
        ``self.srt_captions``.

        Raises:
          Error: The response status was anything other than "200".
        """
        srt_url = "%s?fmt=srt" % self.track_url
        response_headers, body = self.http.request(
            srt_url, "GET", headers=self.headers)

        status = response_headers["status"]
        if status != "200":
            raise Error(
                "Received HTTP response %s when requesting %s?fmt=srt." %
                (status, self.track_url))

        self.srt_captions = SubRipFile.from_string(body)
コード例 #19
0
def get_srt_data(source):
    """Get or create a caption `Note` for every cue parsed from `source.srt_data`.

    Each cue's end time, converted to a number of seconds, is stored as the
    note's `offset`. Nothing is returned.
    """
    def as_seconds(moment):
        # Hours are the largest unit taken into account.
        return moment.second + (moment.minute * 60) + (moment.hour * 60 * 60) + (moment.microsecond / 1000000)

    for caption in SubRipFile.from_string(source.srt_data):
        start_time = caption.start.to_time()
        end_time = caption.end.to_time()
        # NOTE(review): the start offset is computed but never stored; only
        # the end offset is passed on below -- confirm this is intended.
        start_offset = as_seconds(start_time)
        end_offset = as_seconds(end_time)

        note_fields = {
            'text': caption.text,
            'offset': end_offset,
            'user': source.user,
            'user_name': source.user.username,
            'video': source.video,
            'private': False,
            'import_source': source,
            'import_source_name': source.name,
            'source': 'SRT File',
            'original_source': 'SRT File',
            # Sources will probably not carry a URL.
            'source_link': source.url,
            'type': "caption",
        }
        _note, _created = Note.objects.get_or_create(**note_fields)
コード例 #20
0
ファイル: test_srtfile.py プロジェクト: yolesaber/pysrt
 def test_compare_from_string_and_from_path(self):
     """Items parsed from a unicode string must match those parsed from the file path."""
     # Read the fixture inside a context manager so the handle is closed
     # deterministically instead of being left to the garbage collector.
     with codecs.open(self.utf8_path, encoding="utf_8") as utf8_fd:
         unicode_content = utf8_fd.read()
     iterator = izip(SubRipFile.open(self.utf8_path), SubRipFile.from_string(unicode_content))
     for file_item, string_item in iterator:
         self.assertEqual(unicode(file_item), unicode(string_item))
コード例 #21
0
ファイル: test_srtfile.py プロジェクト: yolesaber/pysrt
 def test_utf8(self):
     """UTF-8 content parses into 1332 items; the windows fixture must fail to decode."""
     # Both fixtures are read inside context managers so their handles are
     # closed deterministically.
     with codecs.open(self.utf8_path, encoding="utf_8") as utf8_fd:
         unicode_content = utf8_fd.read()
     self.assertEqual(len(SubRipFile.from_string(unicode_content)), 1332)
     with open(self.windows_path) as windows_fd:
         windows_content = windows_fd.read()
     self.assertRaises(UnicodeDecodeError, SubRipFile.from_string, windows_content)
コード例 #22
0
ファイル: test_srtfile.py プロジェクト: ichernev/pysrt
 def test_utf8(self):
     """UTF-8 content parses into 1332 items; the windows fixture must fail to decode."""
     # Both fixtures are read inside context managers so their handles are
     # closed deterministically.
     with codecs.open(self.utf8_path, encoding='utf_8') as utf8_fd:
         unicode_content = utf8_fd.read()
     self.assertEqual(len(SubRipFile.from_string(unicode_content)), 1332)
     with open(self.windows_path) as windows_fd:
         windows_content = windows_fd.read()
     self.assertRaises(UnicodeDecodeError, SubRipFile.from_string,
                       windows_content)
コード例 #23
0
ファイル: test_srtfile.py プロジェクト: ichernev/pysrt
 def test_compare_from_string_and_from_path(self):
     """Items parsed from a unicode string must match those parsed from the file path."""
     # Read the fixture inside a context manager so the handle is closed
     # deterministically instead of being left to the garbage collector.
     with codecs.open(self.utf8_path, encoding='utf_8') as utf8_fd:
         unicode_content = utf8_fd.read()
     iterator = izip(SubRipFile.open(self.utf8_path),
                     SubRipFile.from_string(unicode_content))
     for file_item, string_item in iterator:
         self.assertEqual(unicode(file_item), unicode(string_item))