Example #1
0
 def is_valid(self):
     """Check if a subtitle text is a valid SubRip format.

     The text in ``self.text`` is parsed with pysrt in raising mode. A
     ``pysrt.Error`` is tolerated when its first argument is greater than 80
     (presumably the position at which parsing failed -- confirm against
     pysrt); any other failure marks the subtitle as invalid.

     :return: ``True`` if the text is accepted, ``False`` otherwise.
     :rtype: bool
     """
     try:
         pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
     except pysrt.Error as e:
         # Errors reported past the threshold do not invalidate the text.
         return e.args[0] > 80
     except Exception:
         # Deliberately not a bare ``except`` so that KeyboardInterrupt and
         # SystemExit keep propagating instead of being logged and swallowed.
         logger.exception('Unexpected error when validating subtitle')
         return False
     return True
Example #2
0
 def is_valid(self):
     """Check if a subtitle text is a valid SubRip format.

     Parses ``self.text`` with pysrt in raising mode. When pysrt raises
     ``pysrt.Error`` the text is still accepted if the error's first
     argument exceeds 80 (presumably a position in the input -- confirm
     against pysrt). Unexpected exceptions are logged and reported as
     invalid.

     :return: ``True`` if the text is accepted, ``False`` otherwise.
     :rtype: bool
     """
     try:
         pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
     except pysrt.Error as e:
         if e.args[0] > 80:
             return True
         return False
     except Exception:
         # ``except Exception`` rather than a bare ``except`` so process
         # control exceptions (KeyboardInterrupt, SystemExit) are not eaten.
         logger.exception('Unexpected error when validating subtitle')
         return False
     else:
         return True
Example #3
0
    def is_valid(self):
        """Check if :attr:`text` is valid SubRip, converting it when it is not.

        The result is cached in ``self._is_valid``. The text is first parsed
        with pysrt; if that fails it is parsed with pysubs2, passed through
        ``self.pysubs2_to_unicode`` and stored in ``self.content`` encoded
        with ``self.get_encoding()``.

        :return: whether or not the subtitle is valid (or could be converted).
        :rtype: bool

        """
        if self._is_valid:
            return True

        text = self.text
        if not text:
            return False

        # valid srt
        try:
            pysrt.from_string(text, error_handling=pysrt.ERROR_RAISE)
        except Exception:
            logger.error("PySRT-parsing failed, trying pysubs2")
        else:
            self._is_valid = True
            return True

        # something else, try to return srt
        try:
            logger.debug("Trying parsing with PySubs2")
            try:
                # in case of microdvd, try parsing the fps from the subtitle
                subs = pysubs2.SSAFile.from_string(text)
                if subs.format == "microdvd":
                    logger.info("Got FPS from MicroDVD subtitle: %s", subs.fps)
                else:
                    logger.info("Got format: %s", subs.format)
            except pysubs2.UnknownFPSError:
                # if parsing failed, use frame rate from provider
                sub_fps = self.get_fps()
                if not isinstance(sub_fps, float) or sub_fps < 10.0:
                    # or use our media file's fps as a fallback
                    sub_fps = self.plex_media_fps
                    logger.info(
                        "No FPS info in subtitle. Using our own media FPS for the MicroDVD subtitle: %s",
                        self.plex_media_fps)
                subs = pysubs2.SSAFile.from_string(text, fps=sub_fps)

            unicontent = self.pysubs2_to_unicode(subs)
            self.content = unicontent.encode(self.get_encoding())
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must not be
            # turned into an "invalid subtitle" result.
            logger.exception("Couldn't convert subtitle %s to .srt format: %s",
                             self, traceback.format_exc())
            return False

        self._is_valid = True
        return True
Example #4
0
def is_valid_subtitle(subtitle_text):
    """Tell whether ``subtitle_text`` parses as SubRip.

    The text is handed to pysrt in raising mode; a ``pysrt.Error`` means the
    text is not valid.

    :return: `True` if the subtitle is valid, `False` otherwise
    :rtype: bool

    """
    try:
        pysrt.from_string(subtitle_text, error_handling=pysrt.ERROR_RAISE)
    except pysrt.Error:
        parsed_ok = False
    else:
        parsed_ok = True
    return parsed_ok
Example #5
0
def is_valid_subtitle(subtitle_text):
    """Report whether pysrt can parse ``subtitle_text`` without an error.

    :return: `True` if the subtitle is valid, `False` otherwise
    :rtype: bool

    """
    is_subrip = True
    try:
        pysrt.from_string(subtitle_text, error_handling=pysrt.ERROR_RAISE)
    except pysrt.Error:
        # pysrt rejected the text in raising mode.
        is_subrip = False
    return is_subrip
Example #6
0
    def __init__(self, movie_sub_number):
        """Initialise the instance: read the .srt file and parse it."""
        self.movie_sub_number = str(movie_sub_number)
        self.filename = CONFIG.subtitles_path + self.movie_sub_number + ".srt"
        self.all_frames = None

        # Read the srt file as raw bytes.
        with open(self.filename, "rb") as srt_file:
            raw_bytes = srt_file.read()

        # Try UTF-8 first; on any failure parse the same bytes as latin-1.
        try:
            parsed = pysrt.from_string(raw_bytes.decode("utf-8"))
        except Exception:
            parsed = pysrt.from_string(raw_bytes.decode("latin-1"))
        self.raw_sub = parsed
Example #7
0
 def test_windows1252(self):
     """Parse windows-1252 text from a string and check size and EOL.

     Also checks that opening the UTF-8 fixture as ASCII raises
     ``UnicodeDecodeError``.
     """
     # Context manager so the fixture handle is closed deterministically.
     with codecs.open(self.windows_path, encoding='windows-1252') as srt_handle:
         srt_string = srt_handle.read()
     srt_file = pysrt.from_string(srt_string, encoding='windows-1252', eol='\r\n')
     # ``assertEqual`` replaces the deprecated ``assertEquals`` alias.
     self.assertEqual(len(srt_file), 1332)
     self.assertEqual(srt_file.eol, '\r\n')
     self.assertRaises(UnicodeDecodeError, pysrt.open,
         self.utf8_path, encoding='ascii')
Example #8
0
def srt_to_html(srt_string):
    """
    Takes UTF-8 encoded, SRT-formatted data and returns marked up html.
    Strips all timestamps; this is to simply render it neatly.

    Each subtitle's text becomes one ``<p>`` element; the elements are
    joined with newlines.
    """
    decoded = srt_string.decode('utf-8')
    paragraphs = []
    for sequence in pysrt.from_string(decoded):
        paragraphs.append(f"<p>{sequence.text}</p>")
    return "\n".join(paragraphs)
def is_valid_subtitle(subtitle_text):
    """Check if a subtitle text is a valid SubRip format

    A ``pysrt.Error`` whose first argument is greater than 80 is tolerated
    (presumably the position of the failure -- confirm against pysrt).
    Unexpected exceptions are logged and treated as invalid.

    :return: `True` if the subtitle is valid, `False` otherwise
    :rtype: bool

    """
    try:
        pysrt.from_string(subtitle_text, error_handling=pysrt.ERROR_RAISE)
    except pysrt.Error as e:
        return e.args[0] > 80
    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        logger.exception('Unexpected error when validating subtitle')
        return False
    return True
Example #10
0
def is_valid_subtitle(subtitle_text):
    """Check if a subtitle text is a valid SubRip format

    The text is parsed by pysrt in raising mode. A ``pysrt.Error`` is only
    fatal when its first argument is 80 or less (presumably a position in
    the input -- confirm against pysrt).

    :return: `True` if the subtitle is valid, `False` otherwise
    :rtype: bool

    """
    try:
        pysrt.from_string(subtitle_text, error_handling=pysrt.ERROR_RAISE)
    except pysrt.Error as e:
        if e.args[0] > 80:
            return True
        return False
    except Exception:
        # ``except Exception`` instead of a bare ``except``: interpreter
        # control exceptions must not be logged and swallowed here.
        logger.exception('Unexpected error when validating subtitle')
        return False
    else:
        return True
Example #11
0
def download_subtitles(movie_title):
    """Download English subtitles for ``movie_title`` and parse them.

    Returns an empty list when subliminal finds nothing for the video;
    otherwise returns whatever ``pysrt.from_string`` produces for the text
    of the first subtitle in the result.
    """
    # Build a subliminal video object from the movie title alone.
    video = subliminal.Video.fromname(movie_title)

    print("Downloading subtitles for '", movie_title, "'...", sep='')
    # Ask subliminal for the best English subtitles for this one video.
    best_subtitles = subliminal.download_best_subtitles(
        {video}, {babelfish.Language('eng')})

    if not best_subtitles[video]:
        print("No subtitles found for '", movie_title, "'...", sep='')
        return []

    # To keep the downloaded file for later use, enable:
    # subliminal.save_subtitles(video, [best_subtitles[video][0]])

    # Only the first subtitle is used; its text is turned into one string
    # that pysrt can parse.
    first_subtitle = best_subtitles[video][0]
    subtitles_string = ''.join([first_subtitle.text])

    # Parse the SRT text with pysrt and return the parsed result.
    return pysrt.from_string(subtitles_string)
Example #12
0
    def is_valid(self):
        """Check if :attr:`text` is valid SubRip, converting it when it is not.

        A positive result is cached in ``self._is_valid``. When pysrt cannot
        parse the text, pysubs2 is tried instead; on success the output of
        ``self.pysubs2_to_unicode`` is stored in ``self.content`` encoded
        with ``self._guessed_encoding``.

        :return: whether or not the subtitle is valid (or could be converted).
        :rtype: bool

        """
        if self._is_valid:
            return True

        text = self.text
        if not text:
            return False

        # valid srt
        try:
            pysrt.from_string(text, error_handling=pysrt.ERROR_RAISE)
        except Exception:
            logger.error("PySRT-parsing failed, trying pysubs2")
        else:
            self._is_valid = True
            return True

        # something else, try to return srt
        try:
            logger.debug("Trying parsing with PySubs2")
            try:
                # in case of microdvd, try parsing the fps from the subtitle
                subs = pysubs2.SSAFile.from_string(text)
                if subs.format == "microdvd":
                    logger.info("Got FPS from MicroDVD subtitle: %s", subs.fps)
                else:
                    logger.info("Got format: %s", subs.format)
            except pysubs2.UnknownFPSError:
                # if parsing failed, suggest our media file's fps
                logger.info("No FPS info in subtitle. Using our own media FPS for the MicroDVD subtitle: %s",
                            self.plex_media_fps)
                subs = pysubs2.SSAFile.from_string(text, fps=self.plex_media_fps)

            unicontent = self.pysubs2_to_unicode(subs)
            self.content = unicontent.encode(self._guessed_encoding)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must not be
            # reported as a failed conversion.
            logger.exception("Couldn't convert subtitle %s to .srt format: %s", self, traceback.format_exc())
            return False

        self._is_valid = True
        return True
Example #13
0
    def is_valid(self):
        """Check if :attr:`text` is a valid SubRip format.

        Empty text is invalid. A ``pysrt.Error`` only invalidates the text
        when its first argument is below 80.

        :return: whether or not the subtitle is valid.
        :rtype: bool

        """
        if not self.text:
            return False

        try:
            pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
        except pysrt.Error as parse_error:
            return not parse_error.args[0] < 80
        else:
            return True
Example #14
0
    def is_valid(self):
        """Check if :attr:`text` is a valid SubRip format.

        :return: ``False`` for empty text or when pysrt raises an error whose
            first argument is below 80, ``True`` otherwise.
        :rtype: bool

        """
        valid = True
        if not self.text:
            valid = False
        else:
            try:
                pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
            except pysrt.Error as failure:
                # Errors at 80 or beyond are tolerated.
                if failure.args[0] < 80:
                    valid = False
        return valid
Example #15
0
    def save_srt_to_file(self, srt_string):
        """Back up ``self.filename``, then overwrite it with ``srt_string``.

        The SRT text is parsed with pysrt, its indexes are cleaned and the
        result is saved as UTF-8.
        """
        # Keep a copy of the current file next to it before overwriting.
        backup_path = self._basename + "-backup." + self._extension
        shutil.copyfile(self.filename, backup_path)
        print("Created backup at {}".format(backup_path))

        parsed = pysrt.from_string(srt_string)
        parsed.clean_indexes()
        parsed.save(self.filename, encoding='utf-8')
Example #16
0
def reset_index(sub_unicode):
    """Renumber the items of an SRT text sequentially, starting at 1.

    The items keep their order; only each item's ``index`` is rewritten.
    Returns the re-serialised SRT text.
    """
    parsed = pysrt.from_string(sub_unicode)
    for position, item in enumerate(parsed, 1):
        item.index = position

    out = StringIO.StringIO()
    parsed.write_into(out)
    renumbered = out.getvalue()
    out.close()
    return renumbered
def get_srts(video_id):
    """Fetch the English subtitles of a YouTube video as a list of dicts.

    The subtitle service at ``URL`` is queried for the video, the English
    download link is taken from the ``<div id="show">`` element, and the
    downloaded SRT text is parsed with pysrt.

    Each result dict has the keys ``videoId``, ``startMinutes``,
    ``endMinutes``, ``startSeconds``, ``endSeconds``, ``text`` and ``num``
    (zero-based position). Only the minutes and seconds components of each
    timestamp are recorded.

    Any failure is printed with its traceback and yields an empty list.

    :return: ``{'result_list': [...]}``
    """
    result_list = []

    try:
        part_url = urllib.urlencode(
            {'url': 'https://www.youtube.com/watch?v=' + video_id})
        content = get_url(URL + '?' + part_url)

        dom = BeautifulSoup(content, 'lxml')
        # Looked up once and reused for both the link and the language check.
        show_div = dom.find('div', {'id': 'show'})

        eng_url = show_div.find_all('b')[0].find_all('a')[0]['href'][2:]

        if not show_div.contents[2].strip().startswith('English'):
            raise Exception('Correct language not found for video ' + video_id)

        content = get_url(URL + eng_url)

        # Drop non-printable characters. The lookup set is built once (not
        # once per character) and ``join`` yields a string on Python 3 too,
        # where ``filter`` would return an iterator.
        printable = set(string.printable)
        content = ''.join(ch for ch in content if ch in printable)

        for num, s in enumerate(pysrt.from_string(content)):
            result = {
                'videoId': video_id,
                'startMinutes': s.start.minutes,
                'endMinutes': s.end.minutes,
                'startSeconds': s.start.seconds,
                'endSeconds': s.end.seconds,
                'text': s.text_without_tags,
                'num': num
            }

            # An end minute before the start minute is clamped to the start.
            if result['endMinutes'] < result['startMinutes']:
                result['endMinutes'] = result['startMinutes']
                result['endSeconds'] = result['startSeconds']

            result_list.append(result)

    except Exception:
        # Best effort: report the problem and return no subtitles, but let
        # KeyboardInterrupt/SystemExit through (no bare ``except``).
        print('Unable to capture subtitles for video %s' % video_id)
        traceback.print_exc()
        result_list = []

    return {'result_list': result_list}
Example #18
0
 def test_windows1252(self):
     """Parse windows-1252 text from a string and check size and EOL.

     Also checks that opening the UTF-8 fixture as ASCII raises
     ``UnicodeDecodeError``.
     """
     # ``with`` closes the fixture instead of leaking the handle.
     with codecs.open(self.windows_path,
                      encoding='windows-1252') as srt_handle:
         srt_string = srt_handle.read()
     srt_file = pysrt.from_string(srt_string,
                                  encoding='windows-1252',
                                  eol='\r\n')
     self.assertEqual(len(srt_file), 1332)
     self.assertEqual(srt_file.eol, '\r\n')
     self.assertRaises(UnicodeDecodeError,
                       pysrt.open,
                       self.utf8_path,
                       encoding='ascii')
def generateSub(args, _subtitle, _filename):
    """Translate every non-empty subtitle line and save the result.

    ``_subtitle`` is stringified, decoded as UTF-8 and parsed with pysrt.
    Each non-empty text is cleaned with ``cleanhtml`` and replaced by the
    output of ``translate`` (called with ``args.LANG_TO`` and
    ``args.LANG_FROM``). The file is saved to ``args.OUTPUT + _filename``.
    """
    subs = pysrt.from_string(str(_subtitle).decode('utf-8'))
    output = args.OUTPUT + _filename
    for entry in subs:
        if entry.text == '':
            continue
        if args.VERBOSE:
            print("Translating line:" + cleanhtml(entry.text))
        entry.text = translate(
            cleanhtml(entry.text).encode('utf-8'), args.LANG_TO,
            args.LANG_FROM)
    subs.save(output)
Example #20
0
def remove_ads_and_save(sub_contents, path):
    """Drop subtitle items that contain an ad word and save the rest.

    ``sub_contents`` is decoded as ISO-8859-15 and parsed with pysrt. Items
    whose lower-cased text contains any entry of ``ADS_WORDS`` are deleted;
    the remaining items are renumbered from 1 and written to ``path`` as
    UTF-8.
    """
    decoded = sub_contents.decode('iso-8859-15')
    srt_sub = pysrt.from_string(decoded)

    position = 0
    while position < len(srt_sub):
        entry = srt_sub[position]
        entry.index = position + 1

        if any(word in entry.text.lower() for word in ADS_WORDS):
            # Deleting shifts the next item into this slot, so stay put.
            del srt_sub[position]
        else:
            position += 1
    srt_sub.save(path, encoding='utf-8')
Example #21
0
def put_subtitles(video, subt_str):
    """Store every item of an SRT text as a ``models.Subtitle`` for ``video``.

    Each item's start and end are converted to ``datetime.time`` values and
    saved together with the item's text.
    """
    def as_time(stamp):
        # datetime.time takes microseconds as its fourth argument.
        return datetime.time(stamp.hours,
                             stamp.minutes,
                             stamp.seconds,
                             stamp.milliseconds * 1000)

    for cue in pysrt.from_string(subt_str):
        begins = as_time(cue.start)
        ends = as_time(cue.end)
        record = models.Subtitle(video=video,
                                 content=cue.text,
                                 start_time=begins,
                                 end_time=ends)
        record.save()
Example #22
0
def remove_ads_and_save(sub_contents, path):
    """Strip ad items from ISO-8859-15 SRT data and write it to ``path``.

    An item counts as an ad when its lower-cased text contains any entry of
    ``ADS_WORDS``. Surviving items are numbered consecutively from 1 and the
    file is saved as UTF-8.
    """
    srt_sub = pysrt.from_string(sub_contents.decode('iso-8859-15'))

    cursor = 0
    while cursor < len(srt_sub):
        current = srt_sub[cursor]
        current.index = cursor + 1

        is_ad = any(word in current.text.lower() for word in ADS_WORDS)
        if not is_ad:
            cursor += 1
        else:
            # The following item moves into this position after deletion.
            del srt_sub[cursor]
    srt_sub.save(path, encoding='utf-8')
Example #23
0
def load_transcript(video):
    """Load and parse the subtitle file that belongs to ``video``.

    The file is read from
    ``/app/data/subs/orig/<item_name>.<srt_extension>.srt``, decoded as
    UTF-8 and parsed with pysrt. The parsed subtitles are shifted by
    -5 seconds before being returned.

    :return: the parsed subtitles, or ``None`` when the video has no srt
        extension or the file cannot be read, decoded or parsed (in that
        case ``video.path`` is printed).
    """
    if video.srt_extension == '':
        return None

    path = '/app/data/subs/orig/{}.{}.srt'.format(video.item_name(),
                                                  video.srt_extension)

    # TODO(wcrichto): small subset of documents are failing with utf8 decode errors
    try:
        # ``with`` closes the file even when decoding or parsing fails.
        with open(path, 'rb') as srt_file:
            raw = srt_file.read()
        subs = pysrt.from_string(raw.decode('utf-8'))
    except Exception:
        print(video.path)
        return None

    # In practice, seems like subs are usually about 5 seconds late, so this is a hand-tuned shift
    subs.shift(seconds=-5)

    return subs
Example #24
0
def srt_to_vtt(s, subs_shift=0):
    """Convert SRT text to a WebVTT document.

    All cues are shifted by ``subs_shift`` seconds first. Each cue is
    written as its zero-based position, a ``start --> end`` line using
    ``HH:MM:SS.mmm`` timestamps, and the cue text. Blocks are separated by
    blank lines after the ``WEBVTT`` header.
    """
    subs = pysrt.from_string(s)
    subs.shift(seconds=subs_shift)

    def fmt_time(t):
        return '{:02d}:{:02d}:{:02d}.{:03d}'.format(t.hours, t.minutes,
                                                    t.seconds, t.milliseconds)

    blocks = ['WEBVTT']
    for position, sub in enumerate(subs):
        blocks.append('{position}\n{start} --> {end}\n{text}'.format(
            position=position,
            start=fmt_time(sub.start),
            end=fmt_time(sub.end),
            text=sub.text))

    return '\n\n'.join(blocks)
Example #25
0
def translate(request):
    """Translate posted SRT text and return it as SSML inside a JSON response.

    For a POST request the first posted value is parsed as SRT. The
    subtitle texts are joined with ``' | '``, stripped of tags and passed to
    ``do_translate``. The translated pieces are concatenated with a
    ``<break>`` element between consecutive subtitles whose duration is the
    gap between them in milliseconds. The JSON body carries ``success``,
    ``api_key`` and ``ssml``.

    Any other request method gets a fixed placeholder JSON body.

    The ``pdb.set_trace()`` calls that used to sit in this view were removed:
    they block every request waiting on a debugger.
    """
    ret_dict = {}
    if request.method == 'POST':
        # First get the srt data
        fs = unicode(request.POST.items()[0][1])
        subs = pysrt.from_string(fs)

        # Get the translated version
        st_to_tl = strip_tags(' | '.join(sub.text for sub in subs))
        st_list = do_translate(st_to_tl)

        final_str = ''
        last_index = len(subs) - 1
        for i, sub in enumerate(subs):
            if i == last_index:
                # The last subtitle has no following gap; close the sentence.
                final_str += ' ' + st_list[i] + '.'
                break
            break_duration = to_milliseconds(subs[i + 1].start - sub.end)
            final_str += st_list[i] + '<break time="' + unicode(break_duration) + 'ms"/>'
        ret_dict['success'] = '1'
        # NOTE(review): hard-coded credential returned to the client --
        # consider moving it to configuration.
        ret_dict['api_key'] = '59e482ac28dd52db23a22aff4ac1d31e'
        ssml = (
            '<?xml version="1.0"?> '
            '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
            'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
            'xsi:schemaLocation="http://www.w3.org/2001/10/synthesis '
            'http://www.w3.org/TR/speech-synthesis/synthesis.xsd" '
            'xml:lang="en-US">' + final_str + '</speak>'
        )
        ssml = smart_str(ssml)
        ret_dict['ssml'] = ssml
        json_response = json.dumps(ret_dict)
    else:
        json_response = '{"Hello" : "world"}'

    return HttpResponse(json_response, content_type="application/json")
Example #26
0
def subsrt(srt, left, right, srt_padding):
    """Cut a window out of ``srt`` and save it to a temporary .srt file.

    The window is ``srt.slice`` from ``left - srt_padding`` to
    ``right + srt_padding``. The copy gets clean indexes and is shifted by
    ``-left`` milliseconds before being saved as UTF-8.

    :return: path of the written file, or ``None`` when the slice is empty.
    """
    window = srt.slice(starts_after=left - srt_padding,
                       ends_before=right + srt_padding)
    if window:
        # NOTE: The slice still references the items of the original srt,
        # so serialise it to text and parse it again to get independent
        # items before modifying them.
        buf = StringIO()
        window.write_into(buf)
        detached = pysrt.from_string(buf.getvalue())

        detached.clean_indexes()
        detached.shift(milliseconds=-left)
        out_path = tempfile_path('.srt')
        detached.save(out_path, encoding='utf-8')
        return out_path
    return None
Example #27
0
def srt_to_vtt(s):
    """Convert SRT text to a WebVTT document, shifting all cues by -5 seconds.

    Each cue is written as its zero-based position, a ``start --> end``
    line using ``HH:MM:SS.mmm`` timestamps, and the cue text. Blocks are
    separated by blank lines after the ``WEBVTT`` header.
    """
    subs = pysrt.from_string(s)
    # Seems like TV news captions are delayed by a few seconds
    subs.shift(seconds=-5)

    def fmt_time(t):
        return u'{:02d}:{:02d}:{:02d}.{:03d}'.format(t.hours, t.minutes,
                                                     t.seconds, t.milliseconds)

    blocks = [u'WEBVTT']
    for position, sub in enumerate(subs):
        blocks.append(u'{position}\n{start} --> {end}\n{text}'.format(
            position=position,
            start=fmt_time(sub.start),
            end=fmt_time(sub.end),
            text=sub.text))

    return u'\n\n'.join(blocks)
Example #28
0
def reset_index(sub_unicode):
    '''Reset SRT subtitles index.

    Every item is renumbered so that indexes run 1, 2, 3, ... in the order
    the items appear; the items themselves are not reordered.

    Args:
        sub_unicode: unicode object containing SRT subtitles
    Returns:
        new_sub_unicode: unicode SRT text with the renumbered items.

    '''
    parsed = pysrt.from_string(sub_unicode)
    for position, item in enumerate(parsed, 1):
        item.index = position

    out = StringIO.StringIO()
    parsed.write_into(out)
    renumbered = out.getvalue()
    out.close()
    return renumbered
Example #29
0
def reset_index(sub_unicode):
    '''Renumber SRT subtitles starting from 1.

    The order of the items is kept; only each item's ``index`` is
    overwritten with its 1-based position.

    Args:
        sub_unicode: unicode object containing SRT subtitles
    Returns:
        new_sub_unicode: unicode SRT text with consecutive indexes.

    '''
    subtitle_file = pysrt.from_string(sub_unicode)
    counter = 0
    for subtitle in subtitle_file:
        counter += 1
        subtitle.index = counter

    serialized = StringIO.StringIO()
    subtitle_file.write_into(serialized)
    result_text = serialized.getvalue()
    serialized.close()
    return result_text
Example #30
0
 def test_compare_from_string_and_from_path(self):
     """Items parsed from a path and from the same text must render equally."""
     # ``with`` closes the fixture instead of leaking the handle.
     with codecs.open(self.utf8_path, encoding='utf_8') as utf8_file:
         unicode_content = utf8_file.read()
     iterator = zip(pysrt.open(self.utf8_path),
                    pysrt.from_string(unicode_content))
     for file_item, string_item in iterator:
         self.assertEqual(str(file_item), str(string_item))
Example #31
0
 def test_compare_from_string_and_from_path(self):
     """Items parsed from a path and from the same text must render equally."""
     # ``with`` closes the fixture instead of leaking the handle.
     with codecs.open(self.utf8_path, encoding='utf_8') as utf8_file:
         unicode_content = utf8_file.read()
     iterator = izip(pysrt.open(self.utf8_path),
         pysrt.from_string(unicode_content))
     for file_item, string_item in iterator:
         # ``assertEqual`` replaces the deprecated ``assertEquals`` alias.
         self.assertEqual(unicode(file_item), unicode(string_item))
Example #32
0
def processFolder(foldercursor):
    """Pair the English and Chinese .srt files of one folder and print matches.

    ``foldercursor`` supplies the folder's glob prefix (``"address"``) and
    its document id (``"_id"``). Every matching .srt file is read, decoded
    with the encoding reported by ``checkEncoding``, stripped of ``<...>``
    and ``{...}`` markup, language-tagged with ``checkLanguage`` and parsed
    with pysrt. Files that fail at any step are reported and skipped.

    Each English subtitle is then compared (``compare_subtitles``) first
    with Chinese subtitles that have the same number of items and, if none
    matched, with the remaining Chinese subtitles. Matched lines are
    collected and printed at the end.

    Fields written to ``movies_info_collection`` for the folder:
    ``srt_count``, ``bilingual_zh_en``, ``matched``, ``no_en_subtitle`` and
    ``finished``.

    :return: ``None``
    """
    filelist = glob.glob(foldercursor["address"] + srt_extension)
    folder_name = foldercursor["_id"]

    def _set_folder_field(field, value):
        # Set a single field on this folder's document.
        movies_info_collection.update_one({'_id': folder_name},
                                          {"$set": {field: value}})

    if not filelist:
        _set_folder_field('finished', True)
        _set_folder_field('srt_count', 0)
        return None  # if no .srt file, skip

    _set_folder_field('srt_count', len(filelist))

    # Compiled once for all files: removes <...> tags and {...} blocks.
    tag_pattern = re.compile('<[^>]*>|{[^}]*}')

    # =================================================
    # hint: reduce the read and write (io) to disk
    # =================================================
    movies_content_in_a_folder = []

    # list of matched subtitle lines
    matched_content_in_a_folder = []
    for filename in filelist:
        try:
            with open(filename) as f:
                content = f.read()

                content_encoding = checkEncoding(content)
                if not content_encoding:
                    continue

                movie_content = {}
                movie_content["content"] = content.decode(content_encoding, 'ignore').encode("utf-8")

                # remove html tags
                movie_content["content"] = tag_pattern.sub('', movie_content["content"])

                movie_content["language"] = checkLanguage(movie_content["content"])

                movie_content["parsed_content"] = pysrt.from_string(
                    movie_content["content"].decode("utf-8", 'ignore'))
                movie_content["total_lines"] = len(movie_content["parsed_content"])
                movie_content["filename"] = filename

                movies_content_in_a_folder.append(movie_content)
        except Exception as e:
            # Best effort: report the failing file's error and move on.
            print(e)

    # check if one srt file folder's srt file is zh and bilingual
    if len(movies_content_in_a_folder) == 1:
        only_content = movies_content_in_a_folder[0]
        if only_content['language'] == 'zh':
            if checkBilingualzhSubtitles(only_content['parsed_content']):
                _set_folder_field('bilingual_zh_en', True)

    # Item counts of subtitles that have already been paired.
    finished_en_subtitles = set()
    finished_zh_subtitles = set()

    def _record_match(en_movie_content, zh_movie_content, result):
        # Store one successful en/zh pairing and flag the folder as matched.
        # if this is a bilingual subtitle, delete eng content in the zh srt file
        if checkBilingualzhSubtitles(zh_movie_content['parsed_content']):
            for line in result:
                line[3] = line[3].replace(line[2], "")
            _set_folder_field('bilingual_zh_en', True)

        matched_content_in_a_folder.extend(result)
        finished_en_subtitles.add(en_movie_content["total_lines"])
        finished_zh_subtitles.add(zh_movie_content["total_lines"])
        # Both passes set the same 'matched' field (the first pass used to
        # write a misspelled 'matchedd' key).
        _set_folder_field('matched', True)

    # =================================================
    # start the iteration for every file in the folder
    # =================================================
    # check eng subtitles counts
    en_subtitle_counts = 0
    for en_movie_content in movies_content_in_a_folder:
        if en_movie_content["language"] != "en":
            continue
        en_subtitle_counts += 1

        if en_movie_content["total_lines"] in finished_en_subtitles:
            continue

        # record the failed zh srt file(s) for a en srt file
        failed_zh_attempts = set()

        # 1st, check the zh srt which has the same number of lines with this eng subtitle
        for zh_movie_content in movies_content_in_a_folder:
            if zh_movie_content["language"] != "zh":
                continue

            if zh_movie_content["total_lines"] != en_movie_content["total_lines"]:
                continue

            # this line is optional
            if zh_movie_content["total_lines"] in failed_zh_attempts:
                continue

            result = compare_subtitles(en_movie_content['parsed_content'],
                                       zh_movie_content['parsed_content'])

            if result:
                _record_match(en_movie_content, zh_movie_content, result)
                break
            else:
                failed_zh_attempts.add(zh_movie_content["total_lines"])

        # if this eng sub has been successfully processed, skip to next eng subtitle
        if en_movie_content["total_lines"] in finished_en_subtitles:
            continue

        # if this eng sub has not been paired, check zh srt with lines other than the same lines
        for zh_movie_content in movies_content_in_a_folder:
            if zh_movie_content["language"] != "zh":
                continue
            # saves time, but optional
            if zh_movie_content["total_lines"] in finished_zh_subtitles:
                continue
            if zh_movie_content["total_lines"] in failed_zh_attempts:
                continue

            result = compare_subtitles(en_movie_content['parsed_content'],
                                       zh_movie_content['parsed_content'])

            if result:
                _record_match(en_movie_content, zh_movie_content, result)
                break
            else:
                failed_zh_attempts.add(zh_movie_content["total_lines"])

    # if there is no eng subtitles
    if en_subtitle_counts == 0:
        _set_folder_field('no_en_subtitle', True)
        # check if there is bilingual zh-en subtitle
        for zh_movie_content in movies_content_in_a_folder:
            if zh_movie_content["language"] != "zh":
                continue
            if checkBilingualzhSubtitles(zh_movie_content['parsed_content']):
                _set_folder_field('bilingual_zh_en', True)
                break

    # TODO: output
    if matched_content_in_a_folder:
        for line in matched_content_in_a_folder:
            print("%s%f%f%s%s" % (folder_name.encode("utf8"), srttime2totaltime(line[0]), srttime2totaltime(line[1]), line[2], line[3]))

    # mark this folder as processed
    _set_folder_field('finished', True)
Example #33
0
	def read_dialogues(self):
		"""Return the text of every subtitle in ``self.contents``, one per line."""
		quotes = pysrt.from_string(self.contents)
		joined = ''.join(quote.text + '\n' for quote in quotes)
		return unicode(joined)
Example #34
0
# File extensions (dot plus three characters) that the script converts.
SUPPORTED_EXTENSIONS = [".xml", ".vtt"]

if __name__ == "__main__":
    # Both directories default to the current directory.
    directory = "."
    help_text = u"path to the {} directory (defaults to current directory)"
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, label in (("-i", "--input", "input"),
                                         ("-o", "--output", "output")):
        parser.add_argument(short_flag,
                            long_flag,
                            type=str,
                            default=directory,
                            help=help_text.format(label, directory))
    a = parser.parse_args()

    # Keep only files whose last four characters are a supported extension.
    filenames = []
    for fn in os.listdir(a.input):
        if fn[-4:].lower() in SUPPORTED_EXTENSIONS:
            filenames.append(fn)

    for fn in tqdm(filenames):
        with codecs.open("{}/{}".format(a.input, fn), 'rb', "utf-8") as f:
            # Convert to SRT text, strip markup, then parse and stack.
            text = f.read()
            text = to_srt(text, fn[-4:])
            text = strip_html(text)
            subs = stack_subs(pysrt.from_string(text))
            # Output keeps the original name with ".srt" appended.
            subs.save("{}/{}.srt".format(a.output, fn), encoding='utf-8')
Example #35
0
 def test_utf8(self):
     """Parse the UTF-8 fixture from a string and check its item count.

     Also checks that reading the windows-1252 fixture with the default
     text encoding raises ``UnicodeDecodeError``.
     """
     # Both fixtures are opened with ``with`` so no handle is leaked.
     with codecs.open(self.utf8_path, encoding='utf_8') as utf8_file:
         unicode_content = utf8_file.read()
     self.assertEqual(len(pysrt.from_string(unicode_content)), 1332)
     # NOTE(review): relies on the platform default encoding rejecting
     # the windows-1252 bytes -- confirm this holds on all CI platforms.
     with open(self.windows_path) as windows_file:
         self.assertRaises(UnicodeDecodeError, windows_file.read)
Example #36
0
 def test_utf8(self):
     """Parse the UTF-8 fixture from a string and check its item count.

     Also checks that ``pysrt.from_string`` raises ``UnicodeDecodeError``
     for the content of the windows-1252 fixture read without an encoding.
     """
     # Both fixtures are opened with ``with`` so no handle is leaked.
     with codecs.open(self.utf8_path, encoding='utf_8') as utf8_file:
         unicode_content = utf8_file.read()
     # ``assertEqual`` replaces the deprecated ``assertEquals`` alias.
     self.assertEqual(len(pysrt.from_string(unicode_content)), 1332)
     with open(self.windows_path) as windows_file:
         windows_content = windows_file.read()
     self.assertRaises(UnicodeDecodeError, pysrt.from_string,
         windows_content)
Example #37
0
 def test_windows1252(self):
     """Parse windows-1252 text from a string and check size and EOL.

     Also checks that opening the UTF-8 fixture as ASCII raises
     ``UnicodeDecodeError``.
     """
     # ``with`` closes the fixture instead of leaking the handle.
     with codecs.open(self.windows_path, encoding="windows-1252") as srt_handle:
         srt_string = srt_handle.read()
     srt_file = pysrt.from_string(srt_string, encoding="windows-1252", eol="\r\n")
     self.assertEqual(len(srt_file), 1332)
     self.assertEqual(srt_file.eol, "\r\n")
     self.assertRaises(UnicodeDecodeError, pysrt.open, self.utf8_path, encoding="ascii")