def get_soup(url, allow_redirects=True):
    """Fetch *url* and return a BeautifulSoup of the response.

    Returns None when the server answers with a non-200 status code.
    Any fetch error triggers a 60s pause and a retry; after more than
    3 failed tries the last exception is re-raised.

    NOTE(review): this file defines get_soup twice; a later definition
    with a different User-Agent shadows this one at import time.
    """
    tries = 0
    while True:
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.68",
            }
            cookies = get_cookies()
            response = requests.get(
                url,
                headers=headers,
                cookies=cookies,
                allow_redirects=allow_redirects,
            )
            # Couldn't find the site. Stop and return None
            if response.status_code != 200:
                return None
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            # Bug fix: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            logger.error("Got exception during html fetch.")
            traceback.print_exc()
            time.sleep(60)
            tries += 1
            if tries > 3:
                raise e
            continue
        return soup
def get_soup(url, allow_redirects=True):
    """Fetch *url* and return a BeautifulSoup of the response.

    Returns None when the server answers with a non-200 status code.
    Any fetch error triggers a 60s pause and a retry; after more than
    3 failed tries the last exception is re-raised.

    NOTE(review): this is the second definition of get_soup in this file;
    it shadows the earlier one (only the User-Agent differs).
    """
    tries = 0
    while True:
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
            }
            cookies = get_cookies()
            response = requests.get(
                url,
                headers=headers,
                cookies=cookies,
                allow_redirects=allow_redirects,
            )
            # Couldn't find the site. Stop and return None
            if response.status_code != 200:
                return None
            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:
            # Bug fix: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit; narrowed to Exception.
            logger.error("Got exception during html fetch.")
            traceback.print_exc()
            time.sleep(60)
            tries += 1
            if tries > 3:
                raise e
            continue
        return soup
def download_user_videos(session, user):
    """Download all videos of a user.

    Returns True when every new video downloaded successfully, else False.
    """
    profile_keys = get_user_video_viewkeys(user)
    # Try to get all uploaded videos
    upload_keys = get_video_upload_viewkeys(user)
    # If that doesn't work, try to get all public uploaded videos
    if not upload_keys:
        upload_keys = get_video_upload_viewkeys(user, True)

    viewkeys = set(profile_keys + upload_keys)
    if not viewkeys:
        logger.error(f"Found 0 videos for user {user.key}. Aborting")
        sys.exit(1)

    full_success = True
    logger.info(f"Found {len(viewkeys)} videos.")
    for viewkey in viewkeys:
        clip = Clip.get_or_create(session, viewkey, user)
        # Already downloaded: just relink the file and backfill the owner.
        if clip.completed:
            if clip.title is not None and clip.extension is not None:
                target_path = get_clip_path(user.name, clip.title, clip.extension)
                link_duplicate(clip, target_path)
            if clip.user is None:
                clip.user = user
            session.commit()
            continue

        success, info = download_video(viewkey, user.name)
        if not success:
            full_success = False
        else:
            clip.title = info["title"]
            clip.tags = info["tags"]
            # "cartegories" presumably matches the model's attribute
            # spelling — kept as-is; verify against the Clip model.
            clip.cartegories = info["categories"]
            clip.completed = True
            clip.user = user
            clip.location = info["out_path"]
            clip.extension = info["ext"]
            logger.info(f"New video: {clip.title}")
        session.commit()
        time.sleep(20)

    return full_success
def download_video(viewkey, name="single_videos"):
    """Download a single video by viewkey via youtube_dl.

    Returns (True, info_dict) on success and (False, None) on failure.
    On success info_dict gains an "out_path" key for the downloaded file.

    NOTE(review): this file defines download_video twice; the later
    definition shadows this one at import time.
    """
    # Decide which domain should be used, depending if the user has a premium account
    is_premium = os.path.exists("cookie_file")
    if is_premium:
        video_url = f"https://www.pornhubpremium.com/view_video.php?viewkey={viewkey}"
    else:
        video_url = f"https://www.pornhub.com/view_video.php?viewkey={viewkey}"

    options = {
        "outtmpl": f"~/pornhub/{name}/%(title)s.%(ext)s",
        "format": "best",
        "quiet": True,
        "retries": 3,
        "nooverwrites": False,
        "continuedl": True,
        # 'all_proxy': 'http://127.0.0.1:1087',  # Uncomment this if you want to use proxy
        # "external_downloader": "aria2c",  # Uncomment this if you know how to use aria2
    }
    if is_premium:
        options["cookiefile"] = "cookie_file"

    ydl = youtube_dl.YoutubeDL(options)

    tries = 0
    while True:
        try:
            logger.info(f"Start downloading: {video_url}")
            info = ydl.extract_info(video_url)
            # NOTE(review): "~" is not expanded here, so out_path is a literal
            # "~/..." string — confirm downstream consumers expanduser() it.
            info["out_path"] = f'~/pornhub/{name}/{info["title"]}.{info["ext"]}'
            return True, info
        except TypeError:
            # This is an error that seems to occur from time to time.
            # A short wait and retry often seems to fix the problem —
            # something about pornhub not properly loading the video.
            logger.info("Got TypeError bug")
            time.sleep(20)
            tries += 1
            # If this happens too many times, something else must be broken.
            if tries > 10:
                return False, None
        except DownloadError:
            # We got a download error.
            # Ignore for now and continue downloading the other videos
            logger.error(f"DownloadError: Failed to download video: {viewkey}.")
            return False, None
        # Bug fix: removed unreachable trailing `time.sleep(6); return False, None`
        # (every path through the loop body returns or retries).
def get_user_info(key):
    """Get all necessary user information."""
    user_type, url, soup = get_user_type_and_url(key)

    raw_name = get_user_name_from_soup(soup, "user")
    if raw_name is None:
        logger.error(f"Couldn't get user info for {key}")
        sys.exit(1)

    # Normalize the display name into a filesystem-friendly identifier.
    cleaned = raw_name.strip().replace(" ", "_")
    cleaned = re.sub(r"[\W]+", "_", cleaned)

    return {
        "type": user_type,
        "url": url,
        "name": cleaned,
    }
def get_playlist_video_viewkeys(playlist):
    """Scrape all viewkeys of the playlist's videos."""
    url = get_playlist_video_url(playlist.id)
    soup = get_soup(url)
    if soup is None:
        logger.error(f"Couldn't find site for playlist {playlist.id}")
        sys.exit(1)

    container = soup.find(id="videoPlaylist")
    # Only take entries carrying a data-video-vkey attribute;
    # the remaining list elements exist for programmatic purposes.
    return [
        entry["data-video-vkey"]
        for entry in container.find_all("li")
        if entry.has_attr("data-video-vkey")
    ]
def get_playlist_info(playlist_id):
    """Get meta information from playlist website.

    Returns a dict with the sanitized playlist "name".
    Exits the process when the playlist page cannot be fetched or parsed.
    """
    url = get_playlist_video_url(playlist_id)
    soup = get_soup(url)
    if soup is None:
        # Bug fix: message was missing its f-prefix and logged "{url}" literally.
        logger.error(f"Got invalid response for playlist: {url}")
        sys.exit(1)

    header = soup.find(id="playlistTopHeader")
    if header is None:
        logger.info(f"Couldn't get info for playlist: {url}")
        check_logged_out(soup)
        sys.exit(1)

    title = header.find("span", {"id": "watchPlaylist"})
    # Normalize the playlist title into a filesystem-friendly name.
    name = title.text.strip()
    name = name.replace(" ", "_")
    name = re.sub(r"[\W]+", "_", name)

    return {"name": name}
def download_playlist_videos(session, playlist):
    """Download all videos of a playlist.

    Returns True when every new video downloaded successfully, else False.
    """
    viewkeys = set(get_playlist_video_viewkeys(playlist))
    if len(viewkeys) == 0:
        # Bug fix: message referenced the Playlist class (`Playlist.id`,
        # a column object) instead of this playlist instance's id.
        logger.error(f"Found 0 videos in playlist {playlist.id}. Aborting")
        sys.exit(1)

    full_success = True
    logger.info(f"Found {len(viewkeys)} videos.")
    for viewkey in viewkeys:
        clip = Clip.get_or_create(session, viewkey)
        # The clip has already been downloaded, skip it.
        if clip.completed:
            if clip.title is not None and clip.extension is not None:
                target_path = get_clip_path(playlist.name, clip.title, clip.extension)
                link_duplicate(clip, target_path)
            continue

        success, info = download_video(viewkey, f"playlists/{playlist.name}")
        if success:
            clip.title = info["title"]
            clip.tags = info["tags"]
            # "cartegories" presumably matches the model's attribute
            # spelling — kept as-is; verify against the Clip model.
            clip.cartegories = info["categories"]
            clip.completed = True
            clip.location = info["out_path"]
            clip.extension = info["ext"]
            logger.info(f"New video: {clip.title}")
        else:
            full_success = False
        session.commit()
        time.sleep(20)

    return full_success
def get_channel_info(channel_id):
    """Get meta information from channel website.

    Returns a dict with the sanitized channel "name".
    Exits the process when the channel page cannot be fetched or parsed.
    """
    url = get_channel_video_url(channel_id)
    soup = get_soup(url)
    if soup is None:
        # Bug fix: message was missing its f-prefix and logged "{url}" literally.
        logger.error(f"Got invalid response for channel: {url}")
        sys.exit(1)

    profile = soup.find(id="channelsProfile")
    if profile is None:
        logger.info(f"Couldn't get info for channel: {url}")
        check_logged_out(soup)
        sys.exit(1)

    # (Removed unused `header`/`wrapper` lookups that were never read.)
    title = profile.find("div", {"class": "title"})
    # Normalize the channel title into a filesystem-friendly name.
    name = title.find("h1").text.strip()
    name = name.replace(" ", "_")
    name = re.sub(r"[\W]+", "_", name)

    return {"name": name}
def check_logged_out(soup):
    """Check if we got logged out.

    Returns True when the page shows the premium login prompt (i.e. we
    appear to be logged out), otherwise False.

    Bug fix: callers use the return value
    (`if check_logged_out(soup): sys.exit(1)`), but this function
    implicitly returned None; it now returns an explicit bool.
    """
    enter_premium = soup.find("div", {"class": "enterPremium"})
    if enter_premium:
        logger.error("Looks like we got logged out.")
        return True
    return False
def get_channel_viewkeys(channel):
    """Scrape all public viewkeys of the channel's videos."""
    is_premium = os.path.exists("http_cookie_file")
    if is_premium:
        url = f"https://www.pornhubpremium.com/channels/{channel.id}/videos"
    else:
        url = f"https://www.pornhub.com/channels/{channel.id}/videos"

    soup = get_soup(url)
    if soup is None:
        logger.error(f"Failed to find video page for channel {channel.id}")
        # Bug fix: previously called check_logged_out(None) here, which
        # would raise AttributeError — there is no soup to inspect.
        sys.exit(1)

    pages = 1
    hasEndlessScrolling = False
    # Some sites have a navigation at the bottom
    # (removed unused `hasNavigation` flag — it was set but never read)
    navigation = soup.find("div", {"class": "pagination3"})
    if navigation is not None:
        children = navigation.findChildren("li", {"class": "page_number"})
        pages = len(children) + 1
    # Others have a button for "endless scrolling".
    # In that case we have to keep crawling as long as the button appears.
    elif soup.find(id="moreDataBtnStream"):
        hasEndlessScrolling = True

    keys = []
    current_page = 1
    next_url = url
    while current_page <= pages:
        # Check if the next site has another "endless scrolling" button as well.
        # If that's the case, increase the counter.
        if hasEndlessScrolling and soup.find(id="moreDataBtnStream"):
            pages += 1

        logger.info(f"Crawling {next_url}")
        # Channel with normal video upload list
        videos = soup.find(id="showAllChanelVideos")
        if videos is None:
            logger.error(f"Couldn't find channel videos in site: {url}")
            check_logged_out(soup)
            sys.exit(1)

        for video in videos.find_all("li"):
            if video.has_attr("_vkey"):
                keys.append(video["_vkey"])

        current_page += 1
        next_url = url + f"?page={current_page}"
        time.sleep(4)
        soup = get_soup(next_url)
        # We couldn't get the next url.
        if soup is None:
            break

    return keys
def get_video_upload_viewkeys(user, public=False):
    """Scrape viewkeys from the user's user/videos/upload route.

    With public=True the public upload route is used instead.
    Returns a (possibly empty) list of viewkey strings.
    """
    # NOTE(review): this checks for a file named "premium" while
    # download_video checks "cookie_file" — confirm which sentinel
    # file actually marks a premium account.
    is_premium = os.path.exists("premium")
    if is_premium:
        url = f"https://www.pornhubpremium.com/{user.user_type}/{user.key}/videos/premium"
    else:
        url = f"https://www.pornhub.com/{user.user_type}/{user.key}/videos/upload"
    if public:
        if is_premium:
            url = f"https://www.pornhubpremium.com/{user.user_type}/{user.key}/videos/upload"
        else:
            url = f"https://www.pornhub.com/{user.user_type}/{user.key}/videos/public"

    soup = get_soup(url)
    if soup is None:
        logger.info(f"Nothing on {url}")
        return []

    pages = 1
    hasEndlessScrolling = False
    # Some sites have a navigation at the bottom
    # (removed unused `hasNavigation` flag — it was set but never read)
    navigation = soup.find("div", {"class": "pagination3"})
    if navigation is not None:
        children = navigation.findChildren("li", {"class": "page_number"})
        pages = len(children) + 1
    # Others have a button for "endless scrolling".
    # In that case we have to keep crawling as long as the button appears.
    elif soup.find(id="moreDataBtnStream"):
        hasEndlessScrolling = True

    keys = []
    current_page = 1
    next_url = url
    while current_page <= pages:
        # Check if the next site has another "endless scrolling" button as well.
        # If that's the case, increase the counter.
        if hasEndlessScrolling and soup.find(id="moreDataBtnStream"):
            pages += 1

        logger.info(f"Crawling {next_url}")
        videoSection = soup.find("div", {"class": "videoUList"})
        pornstarVideoSection = soup.find(id="pornstarsVideoSection")
        claimedUploadedVideoSection = soup.find(id="claimedUploadedVideoSection")
        # Users with normal video upload list
        if videoSection is not None:
            videos = videoSection.find(id="moreData")
        # Users with pornstarVideoSection
        elif pornstarVideoSection is not None:
            videos = pornstarVideoSection
        # Dunno what this is
        elif claimedUploadedVideoSection is not None:
            videos = claimedUploadedVideoSection
        else:
            # Bug fix: the original message contained a raw line break that
            # split the string literal (a syntax error); rejoined on one line.
            logger.error(f"Couldn't find video section on {next_url}. Did we log out?")
            if check_logged_out(soup):
                sys.exit(1)
            return []

        for video in videos.find_all("li"):
            if video.has_attr("data-video-vkey"):
                keys.append(video["data-video-vkey"])

        current_page += 1
        next_url = url + f"?page={current_page}"
        time.sleep(4)
        soup = get_soup(next_url)
        # We couldn't get the next url.
        if soup is None:
            break

    return keys
def download_video(viewkey, name="single_videos"):
    """Download a single video by viewkey via youtube_dl.

    Returns (True, info_dict) on success and (False, None) on failure.
    On success info_dict gains an "out_path" key for the downloaded file.

    NOTE(review): this file defines download_video twice; this later
    definition shadows the earlier one at import time.
    """
    # Decide which domain should be used, depending if the user has a premium account
    is_premium = os.path.exists("premium")
    if is_premium:
        video_url = f"https://www.pornhubpremium.com/view_video.php?viewkey={viewkey}"
    else:
        video_url = f"https://www.pornhub.com/view_video.php?viewkey={viewkey}"

    options = {
        "outtmpl": f"/data/Media/P**n/{name}/%(title)s.%(ext)s",
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        # Bug fix: the YoutubeDL API takes underscored option keys; the
        # hyphenated CLI-flag spellings used before were silently ignored.
        "merge_output_format": "mp4",
        "quiet": True,
        "retries": 3,
        "nooverwrites": False,
        "continuedl": True,
        "external_downloader": "aria2c",
        # external_downloader_args must be a list of argv tokens, not one string.
        "external_downloader_args": [
            "--no-conf", "--file-allocation=none", "-x16", "-s16",
            "-j5", "-k5M", "-c", "-R",
        ],
        # NOTE(review): the former "yes-playlist": True matches the API
        # default (noplaylist=False) so dropping it changes nothing;
        # "add-metadata" has no direct API key — adding metadata needs the
        # FFmpegMetadata postprocessor. Confirm whether that was intended.
    }
    if is_premium:
        options["cookiefile"] = "cookie_file"

    ydl = youtube_dl.YoutubeDL(options)

    tries = 0
    while True:
        try:
            logger.info(f"Start downloading: {video_url}")
            info = ydl.extract_info(video_url)
            info["out_path"] = f'/data/Media/P**n/{name}/{info["title"]}.{info["ext"]}'
            return True, info
        except TypeError:
            # This is an error that seems to occur from time to time.
            # A short wait and retry often seems to fix the problem —
            # something about pornhub not properly loading the video.
            logger.info("Got TypeError bug")
            time.sleep(20)
            tries += 1
            # If this happens too many times, something else must be broken.
            if tries > 10:
                return False, None
        except DownloadError:
            # We got a download error.
            # Ignore for now and continue downloading the other videos
            logger.error(f"DownloadError: Failed to download video: {viewkey}.")
            return False, None
        # Bug fix: removed unreachable trailing `time.sleep(6); return False, None`
        # (every path through the loop body returns or retries).