def _ParseImagePage(self, html, page_url):
    """Extract the original image URL and tags from an illustration page.

    Args:
        html: Source text of the illustration page.
        page_url: URL the page came from; only used in error messages.

    Returns:
        An ``(image_url, tags)`` tuple. ``tags`` holds the text of every tag
        link, plus ``creator:<user>`` and ``title:<title>`` entries when the
        corresponding elements are present and contain plain text.

    Raises:
        HydrusExceptions.MimeException: If the page links to the manga or
            ugoira viewer instead of being a single image.
    """
    if 'member_illust.php?mode=manga' in html:
        raise HydrusExceptions.MimeException(
            page_url + ' was manga, not a single image, so could not be downloaded.')

    if 'member_illust.php?mode=ugoira_view' in html:
        raise HydrusExceptions.MimeException(
            page_url + ' was ugoira, not a single image, so could not be downloaded.')

    # NOTE(review): the find/find_all calls below assume GetSoup returns a
    # BeautifulSoup-style object -- confirm against ClientParsing.
    soup = ClientParsing.GetSoup(html)

    original_image = soup.find(class_='original-image')

    # e.g. http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg
    image_url = original_image['data-src']

    tags_parent = soup.find('section', class_='work-tags')

    # Tag links look like:
    # <a href="/search.php?s_mode=s_tag_full&word=%E3%83%8F..." class="text">[unicode tag here]</a>
    # .string is None when an element holds anything other than one text
    # node, so such links are skipped instead of adding None to the tags.
    tags = [
        link.string
        for link in tags_parent.find_all('a', class_='text')
        if link.string is not None
    ]

    user = soup.find('h1', class_='user')
    if user is not None and user.string is not None:
        tags.append('creator:' + user.string)

    title_parent = soup.find('section', class_=re.compile('work-info'))
    if title_parent is not None:
        title = title_parent.find('h1', class_='title')
        if title is not None and title.string is not None:
            tags.append('title:' + title.string)

    return (image_url, tags)
def CheckFFMPEGError( lines ):
    """Raise if FFMPEG's output is empty or its last line reports a failure.

    Args:
        lines: Output lines captured from FFMPEG.

    Raises:
        HydrusExceptions.MimeException: If there is no output at all, or the
            last line contains 'Invalid data'.
        IOError: If the last line contains 'No such file or directory'.
    """
    line_count = len(lines)

    if line_count == 0:
        raise HydrusExceptions.MimeException( 'Could not parse that file--no FFMPEG output given.' )

    # FFMPEG reports the fatal problem, if any, on its final line.
    final_line = lines[-1]

    if "No such file or directory" in final_line:
        raise IOError( "File not found!" )

    if 'Invalid data' in final_line:
        raise HydrusExceptions.MimeException( 'FFMPEG could not parse.' )
def GetFileInfo(path, mime=None):
    """Gather size, mime and basic media metadata for the file at ``path``.

    Args:
        path: Path of the file to inspect.
        mime: Known mime of the file; looked up with ``GetMime`` when None.

    Returns:
        ``(size, mime, width, height, duration, num_frames, num_words)``.
        Fields that do not apply to the file's mime are None.

    Raises:
        HydrusExceptions.SizeException: If the file is empty.
        HydrusExceptions.MimeException: If the mime is not in
            ``HC.ALLOWED_MIMES``.
    """
    size = os.path.getsize(path)

    if size == 0:
        raise HydrusExceptions.SizeException('File is of zero length!')

    if mime is None:
        mime = GetMime(path)

    if mime not in HC.ALLOWED_MIMES:
        raise HydrusExceptions.MimeException('Filetype is not permitted!')

    # Everything starts unknown; each branch fills in what its handler provides.
    width = height = duration = num_frames = num_words = None

    if mime in (HC.IMAGE_JPEG, HC.IMAGE_PNG, HC.IMAGE_GIF):
        (resolution, duration, num_frames) = HydrusImageHandling.GetImageProperties(path)
        (width, height) = resolution
    elif mime == HC.APPLICATION_FLASH:
        (resolution, duration, num_frames) = HydrusFlashHandling.GetFlashProperties(path)
        (width, height) = resolution
    elif mime in (HC.IMAGE_APNG, HC.VIDEO_AVI, HC.VIDEO_FLV, HC.VIDEO_WMV,
                  HC.VIDEO_MOV, HC.VIDEO_MP4, HC.VIDEO_MKV, HC.VIDEO_WEBM,
                  HC.VIDEO_MPEG):
        (resolution, duration, num_frames) = HydrusVideoHandling.GetFFMPEGVideoProperties(path)
        (width, height) = resolution
    elif mime == HC.APPLICATION_PDF:
        num_words = HydrusDocumentHandling.GetPDFNumWords(path)
    elif mime == HC.AUDIO_MP3:
        duration = HydrusAudioHandling.GetMP3Duration(path)
    elif mime == HC.AUDIO_OGG:
        duration = HydrusAudioHandling.GetOGGVorbisDuration(path)
    elif mime == HC.AUDIO_FLAC:
        duration = HydrusAudioHandling.GetFLACDuration(path)
    elif mime == HC.AUDIO_WMA:
        duration = HydrusAudioHandling.GetWMADuration(path)

    file_info = (size, mime, width, height, duration, num_frames, num_words)

    return file_info
def CheckFFMPEGError(lines):
    """Raise if FFMPEG's output is empty or its last line reports a failure.

    Args:
        lines: Output lines captured from FFMPEG.

    Raises:
        HydrusExceptions.MimeException: If there is no output at all, or the
            last line contains 'Invalid data'.
        IOError: If the last line contains 'No such file or directory'.
    """
    # Without this guard an empty output would surface as a bare IndexError
    # from lines[-1] instead of a meaningful parsing error.
    if len(lines) == 0:
        raise HydrusExceptions.MimeException(
            'Could not parse that file--no FFMPEG output given.')

    last_line = lines[-1]

    if "No such file or directory" in last_line:
        raise IOError("File not found!")

    if 'Invalid data' in last_line:
        raise HydrusExceptions.MimeException('FFMPEG could not parse.')
def ParseFFMPEGFPS( lines ):
    """Estimate the video frame rate from FFMPEG's info output.

    Both the 'tbr' and the 'fps' figure on the video stream line are
    considered. A figure is ignored when it is missing, ends in 'k' or is
    above 60.

    Args:
        lines: Output lines captured from FFMPEG.

    Returns:
        The smallest acceptable figure as a float, or None if neither figure
        is acceptable.

    Raises:
        HydrusExceptions.MimeException: If anything goes wrong while parsing.
    """
    try:
        video_line = ParseFFMPEGVideoLine( lines )

        candidates = []

        for pattern in ( "( [0-9]*.| )[0-9]* tbr", "( [0-9]*.| )[0-9]* fps" ):
            found = re.search( pattern, video_line )

            if found is None:
                continue

            # the match looks like ' 29.97 tbr'; the number is the second token
            token = found.group( 0 ).split( ' ' )[1]

            if token.endswith( 'k' ):
                continue

            rate = float( token )

            if rate > 60:
                continue

            candidates.append( rate )

        if not candidates:
            return None

        # in some cases, fps is 0.77 and tbr is incorrectly 20. extreme values
        # cause bad results, so err on the side of slow.
        # in these cases the frames are probably going to get counted manually
        # anyway due to no neat ints at the end.
        return min( candidates )

    except:
        raise HydrusExceptions.MimeException( 'Error estimating framerate!' )
def ParseFFMPEGDuration( lines ):
    """Return the duration, in seconds, reported in FFMPEG's info output.

    The value is read from the first line containing 'Duration:', e.g.::

        Duration: 00:00:02.46, start: 0.033000, bitrate: 1069 kb/s

    Args:
        lines: Output lines captured from FFMPEG.

    Returns:
        The duration in seconds as a float, with the reported start offset
        subtracted when that offset is at most one second in magnitude, or
        None when FFMPEG reports 'Duration: N/A'.

    Raises:
        HydrusExceptions.MimeException: If no duration line exists or it
            cannot be parsed.
    """
    try:
        line = [ l for l in lines if 'Duration:' in l ][0]

        if 'Duration: N/A' in line:
            return None

        if 'start:' in line:
            m = re.search( '(start\\: )' + '-?[0-9]+\\.[0-9]*', line )

            # skip the 7 characters of 'start: ' to reach the number
            start_offset = float( line[ m.start() + 7 : m.end() ] )

            if abs( start_offset ) > 1.0:
                # once had a file with start offset of 957499 seconds;
                # such values are not trusted
                start_offset = 0
        else:
            start_offset = 0

        match = re.search( "[0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9]", line )

        # Use the whole match: slicing from match.start() + 1 would drop the
        # tens digit of the hours. Build a real list rather than relying on
        # map(), which returns an iterator with no len() on Python 3.
        ( hours, minutes, seconds ) = [ float( part ) for part in match.group( 0 ).split( ':' ) ]

        duration = 3600 * hours + 60 * minutes + seconds

        duration -= start_offset

        return duration

    except Exception:
        raise HydrusExceptions.MimeException( 'Error reading duration!' )
def ParseFFMPEGVideoLine( lines ):
    """Return the first FFMPEG output line that describes a real video stream.

    Lines announcing a 'png' or 'jpg' video stream are ignored, since an mp3
    says it has a 'png' video stream.

    Args:
        lines: Output lines captured from FFMPEG.

    Returns:
        The first matching line.

    Raises:
        HydrusExceptions.MimeException: If no line qualifies.
    """
    video_lines = [
        candidate
        for candidate in lines
        if 'Video: ' in candidate
        and 'Video: png' not in candidate
        and 'Video: jpg' not in candidate
    ]

    if not video_lines:
        raise HydrusExceptions.MimeException( 'Could not find video information!' )

    return video_lines[0]
def ParseFFMPEGMimeText(lines):
    """Return the container format text from FFMPEG's 'Input #0' line.

    For a line such as ``Input #0, matroska, webm, from 'm.mkv':`` the text
    between the 'Input #0, ' prefix and ', from' is returned.

    Args:
        lines: Output lines captured from FFMPEG.

    Returns:
        The format text as a string.

    Raises:
        HydrusExceptions.MimeException: If there is not exactly one line
            starting with 'Input #0', or the lines cannot be examined.
    """
    try:
        # exactly one input line is expected; anything else fails to unpack
        (input_line, ) = [l for l in lines if l.startswith('Input #0')]

        # drop the 10 characters of 'Input #0, '
        text = input_line[10:]

        mime_text = text.split(', from')[0]

        return mime_text

    except Exception:
        # 'except Exception' rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not turned into a parsing error.
        raise HydrusExceptions.MimeException('Error reading mime!')
def ParseFFMPEGNumFramesManually( lines ):
    """Return the frame count from the last 'frame=' progress line.

    Args:
        lines: Output lines captured from an FFMPEG run that rendered the
            file, so that 'frame=' progress lines are present.

    Returns:
        The frame count of the final progress line, as an int.

    Raises:
        HydrusExceptions.MimeException: If there is no progress line or its
            count cannot be parsed.
    """
    try:
        prefix = 'frame='

        # Match on 'frame=' without requiring a following space: the count is
        # space-padded only while it is short, so a line like 'frame=12345'
        # must not be skipped.
        frame_lines = [ l for l in lines if l.startswith( prefix ) ]

        # there will be several of these, counting up as the file renders,
        # so the final one holds the total
        last_frame_line = frame_lines[-1]

        # split() with no argument handles any run of padding whitespace
        count_text = last_frame_line[ len( prefix ) : ].split()[0]

        return int( count_text )

    except Exception:
        raise HydrusExceptions.MimeException( 'Error counting number of frames!' )
def ParseFFMPEGFPS( lines ):
    """Estimate the video frame rate from FFMPEG's info output.

    The 'tbr' figure on the video stream line is preferred. When it is
    missing, ends in 'k' or is above 60, the 'fps' figure is tried under the
    same rules.

    Args:
        lines: Output lines captured from FFMPEG.

    Returns:
        The first acceptable figure as a float, or None if neither is
        acceptable.

    Raises:
        HydrusExceptions.MimeException: If anything goes wrong while parsing.
    """
    try:
        video_line = ParseFFMPEGVideoLine( lines )

        # tried in order of preference; the first plausible figure wins
        for pattern in ( "( [0-9]*.| )[0-9]* tbr", "( [0-9]*.| )[0-9]* fps" ):
            found = re.search( pattern, video_line )

            if found is None:
                continue

            # the match looks like ' 29.97 tbr'; the number is the second token
            token = found.group( 0 ).split( ' ' )[1]

            if token.endswith( 'k' ):
                continue

            rate = float( token )

            if rate > 60:
                continue

            return rate

        return None

    except:
        raise HydrusExceptions.MimeException( 'Error estimating framerate!' )
def ParseFFMPEGVideoResolution(lines):
    """Return the video resolution parsed from FFMPEG's video stream line.

    The size is read from text of the form '460x320' (w x h). When the line
    also carries a ' SAR w:h ' entry, the width is scaled by that ratio
    using integer division.

    Args:
        lines: Output lines captured from FFMPEG.

    Returns:
        ``[width, height]`` as a list when no SAR entry is present, otherwise
        ``(width, height)`` as a tuple with the adjusted width.

    Raises:
        HydrusExceptions.MimeException: If anything goes wrong while parsing.
    """
    try:
        video_line = ParseFFMPEGVideoLine(lines)

        # the size is followed by ',' or ' ', which is trimmed off here
        size_match = re.search(" [0-9]*x[0-9]*(,| )", video_line)
        size_text = video_line[size_match.start():size_match.end() - 1]
        resolution = [int(dimension) for dimension in size_text.split('x')]

        sar_match = re.search(" SAR [0-9]*:[0-9]* ", video_line)

        if sar_match is None:
            return resolution

        # ' SAR 2:3 ' -> '2:3'
        ratio_text = sar_match.group(0)[5:-1]

        (sar_w_text, sar_h_text) = ratio_text.split(':')
        sar_w = int(sar_w_text)
        sar_h = int(sar_h_text)

        (width, height) = resolution

        width = (width * sar_w) // sar_h

        return (width, height)

    except:
        raise HydrusExceptions.MimeException('Error parsing resolution!')
def Hydrusffmpeg_parse_infos(filename, print_infos=False):
    """Get file infos using ffmpeg.

    FFMPEG is run on ``filename`` and its stderr output is parsed.

    Args:
        filename: Path of the file to inspect. Names ending in '.gif' are
            rendered to a null output so that 'frame=' progress lines are
            produced.
        print_infos: When True, the whole text returned by FFMPEG is passed
            to ``HydrusData.Print``.

    Returns:
        A dictionary with the fields:

        - "duration": seconds, with the start offset subtracted when that
          offset is at most one second in magnitude
        - "mime_text": format text of the 'Input #0' line (only when exactly
          one such line exists)
        - "video_found", and when a video stream is found: "video_size"
          ([w, h]), "video_fps", "video_nframes" (duration * fps, rounded
          up), "video_duration" (same value as "duration")
        - "audio_found", and when an audio stream is found: "audio_fps"
          (Hz as int, or 'unknown')

    Raises:
        Exception: If FFMPEG could not be started and FFMPEG_PATH does not
            exist.
        IOError: If FFMPEG reports a missing file, or the duration cannot be
            read.
        HydrusExceptions.MimeException: If FFMPEG gave no output or reported
            invalid data.
    """
    # open the file in a pipe, provoke an error, read output
    cmd = [FFMPEG_PATH, "-i", filename]

    is_GIF = filename.endswith('.gif')

    if is_GIF:
        if HC.PLATFORM_WINDOWS:
            cmd += ["-f", "null", "NUL"]
        else:
            cmd += ["-f", "null", "/dev/null"]

    try:
        proc = subprocess.Popen(
            cmd,
            bufsize=10**5,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            startupinfo=HydrusData.GetSubprocessStartupInfo())
    except Exception:
        # give a clearer message when the executable itself is missing
        if not os.path.exists(FFMPEG_PATH):
            raise Exception('FFMPEG was not found!')
        else:
            raise

    infos = proc.stderr.read().decode('utf8')

    proc.terminate()

    del proc

    if print_infos:
        # print the whole info text returned by FFMPEG
        HydrusData.Print(infos)

    lines = infos.splitlines()

    # lines[-1] below would raise a bare IndexError on empty output
    if len(lines) == 0:
        raise HydrusExceptions.MimeException(
            'Could not parse that file--no FFMPEG output given.')

    if "No such file or directory" in lines[-1]:
        raise IOError("%s not found ! Wrong path ?" % filename)

    if 'Invalid data' in lines[-1]:
        raise HydrusExceptions.MimeException('FFMPEG could not parse.')

    result = dict()

    # get duration (in seconds)
    # Duration: 00:00:02.46, start: 0.033000, bitrate: 1069 kb/s
    try:
        keyword = ('frame=' if is_GIF else 'Duration: ')

        line = [l for l in lines if keyword in l][0]

        if 'start:' in line:
            m = re.search('(start\\: )' + '-?[0-9]+\\.[0-9]*', line)

            # skip the 7 characters of 'start: ' to reach the number
            start_offset = float(line[m.start() + 7:m.end()])

            if abs(start_offset) > 1.0:
                # once had a file with start offset of 957499 seconds;
                # such values are not trusted
                start_offset = 0
        else:
            start_offset = 0

        match = re.search("[0-9][0-9]:[0-9][0-9]:[0-9][0-9].[0-9][0-9]", line)

        # Use the whole match: slicing from match.start() + 1 would drop the
        # tens digit of the hours. Build a real list rather than relying on
        # map(), which returns an iterator with no len() on Python 3.
        (hours, minutes, seconds) = [
            float(part) for part in match.group(0).split(':')
        ]

        result['duration'] = 3600 * hours + 60 * minutes + seconds

        result['duration'] -= start_offset

    except Exception:
        raise IOError("Error reading duration in file %s," % (filename) +
                      "Text parsed: %s" % infos)

    try:
        (input_line, ) = [l for l in lines if l.startswith('Input #0')]

        # Input #0, matroska, webm, from 'm.mkv':
        text = input_line[10:]

        mime_text = text.split(', from')[0]

        result['mime_text'] = mime_text

    except ValueError:
        # best effort: zero or several 'Input #0' lines simply means no
        # 'mime_text' entry is recorded
        pass

    # get the output line that speaks about video
    lines_video = [
        l for l in lines
        if ' Video: ' in l and not (' Video: png' in l or ' Video: jpg' in l)
    ]  # mp3 says it has a 'png' video stream

    result['video_found'] = (lines_video != [])

    if result['video_found']:
        line = lines_video[0]

        # get the size, of the form 460x320 (w x h)
        match = re.search(" [0-9]*x[0-9]*(,| )", line)
        s = list(map(int, line[match.start():match.end() - 1].split('x')))
        result['video_size'] = s

        # get the frame rate: prefer 'tbr', fall back to 'fps' when the tbr
        # figure is missing or is not a plain number
        try:
            match = re.search("( [0-9]*.| )[0-9]* tbr", line)
            result['video_fps'] = float(
                line[match.start():match.end()].split(' ')[1])
        except Exception:
            match = re.search("( [0-9]*.| )[0-9]* fps", line)
            result['video_fps'] = float(
                line[match.start():match.end()].split(' ')[1])

        num_frames = result['duration'] * result['video_fps']

        if num_frames != int(num_frames):
            num_frames += 1  # rounding up

        result['video_nframes'] = int(num_frames)
        result['video_duration'] = result['duration']
        # We could have also recomputed the duration from the number
        # of frames, as follows:
        # >>> result['video_duration'] = result['video_nframes'] / result['video_fps']

    lines_audio = [l for l in lines if ' Audio: ' in l]

    result['audio_found'] = lines_audio != []

    if result['audio_found']:
        line = lines_audio[0]

        try:
            match = re.search(" [0-9]* Hz", line)
            result['audio_fps'] = int(line[match.start() + 1:match.end()])
        except Exception:
            result['audio_fps'] = 'unknown'

    return result
def GetFFMPEGVideoProperties(path, count_frames_manually=False):
    """Return ``(resolution, duration_in_ms, num_frames)`` for the video at ``path``.

    The values are parsed from the lines returned by ``GetFFMPEGInfoLines``.
    When those lines give no duration, or the estimated frame count of a
    file under 30 * 1048576 bytes is not a whole number, the lines are
    fetched again with ``count_frames_manually`` set and the frame count is
    read with ``ParseFFMPEGNumFramesManually`` instead.

    Args:
        path: Path of the file to inspect.
        count_frames_manually: Passed through to ``GetFFMPEGInfoLines``. When
            True, the frame count is read with
            ``ParseFFMPEGNumFramesManually`` rather than estimated.

    Returns:
        A ``(resolution, duration_in_ms, num_frames)`` tuple. When the manual
        count is not used, ``num_frames`` is ``duration * fps`` as computed,
        without conversion to int.

    Raises:
        HydrusExceptions.MimeException: If the file does not appear to have a
            video stream, or if the manual frame count fails and there is no
            estimate to fall back on.
    """
    lines = GetFFMPEGInfoLines(path, count_frames_manually)

    if not ParseFFMPEGHasVideo(lines):
        raise HydrusExceptions.MimeException(
            'File did not appear to have a video stream!')

    resolution = ParseFFMPEGVideoResolution(lines)

    duration = ParseFFMPEGDuration(lines)

    if duration is None:
        # No duration available: derive it from a manual frame count and the
        # frame rate.
        fps = ParseFFMPEGFPS(lines)

        if fps is None:
            fps = 24  # no usable rate either, so just assume one

        if not count_frames_manually:
            # the lines fetched so far were not requested with manual counting
            count_frames_manually = True

            lines = GetFFMPEGInfoLines(path, count_frames_manually)

        num_frames = ParseFFMPEGNumFramesManually(lines)

        duration = num_frames / float(fps)

    else:
        num_frames = None

        if not count_frames_manually:
            # Try the cheap estimate first: duration * fps.
            fps = ParseFFMPEGFPS(lines)

            it_was_accurate = fps is not None

            if it_was_accurate:
                num_frames = duration * fps

                if num_frames != int(
                        num_frames
                ):  # we want whole numbers--anything else suggests start_offset is off or whatever
                    if os.path.getsize(
                            path
                    ) < 30 * 1048576:  # but only defer to a super precise +/- 1-frame manual count in this case when the file is small
                        it_was_accurate = False

            if not it_was_accurate:
                count_frames_manually = True

                lines = GetFFMPEGInfoLines(path, count_frames_manually)

        if count_frames_manually:
            try:
                num_frames = ParseFFMPEGNumFramesManually(lines)
            except HydrusExceptions.MimeException:
                # keep the estimate if there is one; otherwise there is
                # nothing to return, so let the error propagate
                if num_frames is None:
                    raise

    duration_in_ms = int(duration * 1000)

    return (resolution, duration_in_ms, num_frames)
def GetFileInfo(path, mime=None):
    """Gather size, mime and basic media metadata for the file at ``path``.

    Args:
        path: Path of the file to inspect.
        mime: Known mime of the file; looked up with ``GetMime`` when None.

    Returns:
        ``(size, mime, width, height, duration, num_frames, num_words)``.
        Fields that do not apply to the file's mime are None, and numeric
        fields are never negative. For audio, ``duration`` is in
        milliseconds, or None when FFMPEG reports no duration.

    Raises:
        HydrusExceptions.SizeException: If the file is empty.
        HydrusExceptions.MimeException: If the mime is not in
            ``HC.ALLOWED_MIMES``.
    """
    size = os.path.getsize(path)

    if size == 0:
        raise HydrusExceptions.SizeException('File is of zero length!')

    if mime is None:
        mime = GetMime(path)

    if mime not in HC.ALLOWED_MIMES:
        raise HydrusExceptions.MimeException('Filetype is not permitted!')

    width = None
    height = None
    duration = None
    num_frames = None
    num_words = None

    if mime in (HC.IMAGE_JPEG, HC.IMAGE_PNG, HC.IMAGE_GIF):
        ((width, height), duration,
         num_frames) = HydrusImageHandling.GetImageProperties(path)
    elif mime == HC.APPLICATION_FLASH:
        ((width, height), duration,
         num_frames) = HydrusFlashHandling.GetFlashProperties(path)
    elif mime in (HC.IMAGE_APNG, HC.VIDEO_AVI, HC.VIDEO_FLV, HC.VIDEO_WMV,
                  HC.VIDEO_MOV, HC.VIDEO_MP4, HC.VIDEO_MKV, HC.VIDEO_WEBM,
                  HC.VIDEO_MPEG):
        ((width, height), duration,
         num_frames) = HydrusVideoHandling.GetFFMPEGVideoProperties(path)
    elif mime == HC.APPLICATION_PDF:
        num_words = HydrusDocumentHandling.GetPDFNumWords(path)
    elif mime in HC.AUDIO:
        ffmpeg_lines = HydrusVideoHandling.GetFFMPEGInfoLines(path)

        duration_in_s = HydrusVideoHandling.ParseFFMPEGDuration(ffmpeg_lines)

        # ParseFFMPEGDuration gives None for 'Duration: N/A'; leave the
        # duration unknown rather than failing on None * 1000.
        if duration_in_s is not None:
            duration = int(duration_in_s * 1000)

    # Parsers can occasionally hand back negative values; store magnitudes.
    if width is not None and width < 0:
        width *= -1

    if height is not None and height < 0:
        # this branch used to negate width, leaving height negative
        height *= -1

    if duration is not None and duration < 0:
        duration *= -1

    if num_frames is not None and num_frames < 0:
        num_frames *= -1

    if num_words is not None and num_words < 0:
        num_words *= -1

    return (size, mime, width, height, duration, num_frames, num_words)
def GetFileInfo(path, mime=None):
    """Gather size, mime and basic media metadata for the file at ``path``.

    Args:
        path: Path of the file to inspect.
        mime: Known mime of the file; looked up with ``GetMime`` when None.

    Returns:
        ``(size, mime, width, height, duration, num_frames, num_words)``.
        Fields that do not apply to the file's mime are None, and numeric
        fields are never negative. For audio, ``duration`` is in
        milliseconds, or None when FFMPEG reports no duration.

    Raises:
        HydrusExceptions.SizeException: If the file is empty.
        HydrusExceptions.MimeException: If the mime is not in
            ``HC.ALLOWED_MIMES``; the message distinguishes HTML, unknown
            and otherwise unpermitted filetypes.
    """
    size = os.path.getsize(path)

    if size == 0:
        raise HydrusExceptions.SizeException('File is of zero length!')

    if mime is None:
        mime = GetMime(path)

    if mime not in HC.ALLOWED_MIMES:
        if mime == HC.TEXT_HTML:
            raise HydrusExceptions.MimeException(
                'Looks like HTML -- maybe the client needs to be taught how to parse this?'
            )
        elif mime == HC.APPLICATION_UNKNOWN:
            raise HydrusExceptions.MimeException('Unknown filetype!')
        else:
            raise HydrusExceptions.MimeException('Filetype is not permitted!')

    width = None
    height = None
    duration = None
    num_frames = None
    num_words = None

    if mime in (HC.IMAGE_JPEG, HC.IMAGE_PNG, HC.IMAGE_GIF):
        ((width, height), duration,
         num_frames) = HydrusImageHandling.GetImageProperties(path, mime)
    elif mime == HC.APPLICATION_FLASH:
        ((width, height), duration,
         num_frames) = HydrusFlashHandling.GetFlashProperties(path)
    elif mime in (HC.IMAGE_APNG, HC.VIDEO_AVI, HC.VIDEO_FLV, HC.VIDEO_WMV,
                  HC.VIDEO_MOV, HC.VIDEO_MP4, HC.VIDEO_MKV, HC.VIDEO_WEBM,
                  HC.VIDEO_MPEG):
        ((width, height), duration,
         num_frames) = HydrusVideoHandling.GetFFMPEGVideoProperties(path)
    elif mime == HC.APPLICATION_PDF:
        # this now gives None until a better solution can be found
        num_words = HydrusDocumentHandling.GetPDFNumWords(path)
    elif mime in HC.AUDIO:
        ffmpeg_lines = HydrusVideoHandling.GetFFMPEGInfoLines(path)

        duration_in_s = HydrusVideoHandling.ParseFFMPEGDuration(ffmpeg_lines)

        # ParseFFMPEGDuration gives None for 'Duration: N/A'; leave the
        # duration unknown rather than failing on None * 1000.
        if duration_in_s is not None:
            duration = int(duration_in_s * 1000)

    # Parsers can occasionally hand back negative values; store magnitudes.
    if width is not None and width < 0:
        width *= -1

    if height is not None and height < 0:
        # this branch used to negate width, leaving height negative
        height *= -1

    if duration is not None and duration < 0:
        duration *= -1

    if num_frames is not None and num_frames < 0:
        num_frames *= -1

    if num_words is not None and num_words < 0:
        num_words *= -1

    return (size, mime, width, height, duration, num_frames, num_words)