Example #1
# Imports this snippet needs; logger and get_cookies are helpers
# defined elsewhere in the project.
import time
import traceback

import requests
from bs4 import BeautifulSoup


def get_soup(url, allow_redirects=True):
    """Get new soup instance from url."""
    tries = 0
    while True:
        try:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.68",
            }
            cookies = get_cookies()
            response = requests.get(url,
                                    headers=headers,
                                    cookies=cookies,
                                    allow_redirects=allow_redirects)

            # Non-200 response. Stop and return None
            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:  # not BaseException: don't swallow KeyboardInterrupt/SystemExit
            logger.error("Got exception during html fetch.")
            traceback.print_exc()
            time.sleep(60)
            tries += 1

            if tries > 3:
                raise e
            continue

        return soup
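A quick usage sketch for get_soup (the URL is a placeholder; logger and get_cookies are assumed to be the project's own helpers):

soup = get_soup("https://example.com/some/page")
if soup is None:
    print("Gave up: the server did not answer with a 200 response")
else:
    print(soup.title.string if soup.title else "page has no <title>")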
Example #2
def get_soup(url, allow_redirects=True):
    """Get new soup instance from url."""
    tries = 0
    while True:
        try:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0",
            }
            cookies = get_cookies()
            response = requests.get(url,
                                    headers=headers,
                                    cookies=cookies,
                                    allow_redirects=allow_redirects)

            # Non-200 response. Stop and return None
            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.text, "html.parser")
        except Exception as e:  # not BaseException: don't swallow KeyboardInterrupt/SystemExit
            logger.error("Got exception during html fetch.")
            traceback.print_exc()
            time.sleep(60)
            tries += 1

            if tries > 3:
                raise e
            continue

        return soup
Example #3
def download_user_videos(session, user):
    """Download all videos of a user."""
    video_viewkeys = get_user_video_viewkeys(user)

    # Try to get all uploaded videos
    video_upload_viewkeys = get_video_upload_viewkeys(user)
    # If that doesn't work, try to get all public uploaded videos
    if len(video_upload_viewkeys) == 0:
        video_upload_viewkeys = get_video_upload_viewkeys(user, True)

    viewkeys = set(video_viewkeys + video_upload_viewkeys)

    if len(viewkeys) == 0:
        logger.error(f"Found 0 videos for user {user.key}. Aborting")
        sys.exit(1)

    full_success = True

    logger.info(f"Found {len(viewkeys)} videos.")
    for viewkey in viewkeys:
        clip = Clip.get_or_create(session, viewkey, user)

        # The clip has already been downloaded, skip it.
        if clip.completed:
            if clip.title is not None and clip.extension is not None:
                target_path = get_clip_path(user.name, clip.title,
                                            clip.extension)
                link_duplicate(clip, target_path)

            if clip.user is None:
                clip.user = user
                session.commit()

            continue

        success, info = download_video(viewkey, user.name)
        if success:
            clip.title = info["title"]
            clip.tags = info["tags"]
            clip.cartegories = info["categories"]
            clip.completed = True
            clip.user = user
            clip.location = info["out_path"]
            clip.extension = info["ext"]

            logger.info(f"New video: {clip.title}")
        else:
            full_success = False

        session.commit()
        time.sleep(20)

    return full_success
Example #4
def download_video(viewkey, name="single_videos"):
    """Download the video."""
    # Decide which domain to use, depending on whether the user has a premium account
    is_premium = os.path.exists("cookie_file")
    if is_premium:
        video_url = f"https://www.pornhubpremium.com/view_video.php?viewkey={viewkey}"
    else:
        video_url = f"https://www.pornhub.com/view_video.php?viewkey={viewkey}"

    options = {
        "outtmpl": f"~/pornhub/{name}/%(title)s.%(ext)s",
        "format": "best",
        "quiet": True,
        "retries": 3,
        "nooverwrites": False,
        "continuedl": True,
        # 'all_proxy': 'http://127.0.0.1:1087', # Uncomment this if you want to use a proxy
        # "external_downloader": "aria2c", # Uncomment this if you know how to use aria2
    }
    if is_premium:
        options["cookiefile"] = "cookie_file"

    ydl = youtube_dl.YoutubeDL(options)
    tries = 0
    while True:
        try:
            logger.info(f"Start downloading: {video_url}")
            info = ydl.extract_info(video_url)
            info["out_path"] = f'~/pornhub/{name}/{info["title"]}.{info["ext"]}'
            return True, info
        except TypeError:
            # This is an error that seems to occur from time to time
            # A short wait and retry often seems to fix the problem
            # This is something about pornhub not properly loading the video.
            logger.info("Got TypeError bug")
            time.sleep(20)
            tries += 1

            # If this happens too many times, something else must be broken.
            if tries > 10:
                return False, None
            continue
        except DownloadError:
            # We got a download error.
            # Ignore for now and continue downloading the other videos
            logger.error(
                f"DownloadError: Failed to download video: {viewkey}.")
            return False, None

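A usage sketch for download_video (the viewkey below is a made-up placeholder):

success, info = download_video("some_viewkey")
if success:
    print(f"Downloaded to {info['out_path']}")
else:
    print("Download failed after retries")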
Example #5
def get_user_info(key):
    """Get all necessary user information."""
    user_type, url, soup = get_user_type_and_url(key)
    name = get_user_name_from_soup(soup, "user")
    if name is None:
        logger.error(f"Couldn't get user info for {key}")
        sys.exit(1)

    name = name.strip()
    name = name.replace(" ", "_")
    name = re.sub(r"[\W]+", "_", name)

    return {
        "type": user_type,
        "url": url,
        "name": name,
    }
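The strip/replace/re.sub sequence above is the same name-sanitization pattern that Examples #7 and #9 use. A self-contained sketch of what it does:

import re

def sanitize(name):
    """Turn whitespace and non-word characters into underscores."""
    name = name.strip().replace(" ", "_")
    return re.sub(r"[\W]+", "_", name)

print(sanitize("  Some User / Name!  "))
# -> Some_User___Name_  (underscores are word characters, so they are kept as-is)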
Example #6
def get_playlist_video_viewkeys(playlist):
    """Scrape all viewkeys of the playlist's videos."""
    url = get_playlist_video_url(playlist.id)
    soup = get_soup(url)
    if soup is None:
        logger.error(f"Couldn't find site for playlist {playlist.id}")
        sys.exit(1)

    videos = soup.find(id="videoPlaylist")

    keys = []
    for video in videos.find_all("li"):
        # Only take entries that have a data-video-vkey attribute.
        # Some list elements exist purely for programmatic purposes and carry no key.
        if video.has_attr("data-video-vkey"):
            keys.append(video["data-video-vkey"])

    return keys
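The has_attr check is what separates real playlist entries from purely structural <li> elements. A self-contained sketch with made-up markup:

from bs4 import BeautifulSoup

html = """
<ul id="videoPlaylist">
  <li class="js-spacer"></li>
  <li data-video-vkey="abc123"></li>
  <li data-video-vkey="def456"></li>
</ul>
"""
soup = BeautifulSoup(html, "html.parser")
videos = soup.find(id="videoPlaylist")
keys = [li["data-video-vkey"] for li in videos.find_all("li")
        if li.has_attr("data-video-vkey")]
print(keys)  # ['abc123', 'def456']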
Example #7
def get_playlist_info(playlist_id):
    """Get meta information from playlist website."""
    url = get_playlist_video_url(playlist_id)
    soup = get_soup(url)
    if soup is None:
        logger.error("Got invalid response for playlist: {url}")
        sys.exit(1)

    header = soup.find(id="playlistTopHeader")
    if header is None:
        logger.info(f"Couldn't get info for playlist: {url}")
        check_logged_out(soup)
        sys.exit(1)

    title = header.find("span", {"id": "watchPlaylist"})
    name = title.text.strip()

    name = name.replace(" ", "_")
    name = re.sub(r"[\W]+", "_", name)

    return {"name": name}
Example #8
def download_playlist_videos(session, playlist):
    """Download all videos of a playlist."""
    viewkeys = set(get_playlist_video_viewkeys(playlist))

    if len(viewkeys) == 0:
        logger.error(f"Found 0 videos in playlist {Playlist.id}. Aborting")
        sys.exit(1)

    full_success = True

    logger.info(f"Found {len(viewkeys)} videos.")
    for viewkey in viewkeys:
        clip = Clip.get_or_create(session, viewkey)

        # The clip has already been downloaded, skip it.
        if clip.completed:
            if clip.title is not None and clip.extension is not None:
                target_path = get_clip_path(playlist.name, clip.title,
                                            clip.extension)
                link_duplicate(clip, target_path)

            continue

        success, info = download_video(viewkey, f"playlists/{playlist.name}")
        if success:
            clip.title = info["title"]
            clip.tags = info["tags"]
            clip.cartegories = info["categories"]
            clip.completed = True
            clip.location = info["out_path"]
            clip.extension = info["ext"]

            logger.info(f"New video: {clip.title}")
        else:
            full_success = False

        session.commit()
        time.sleep(20)

    return full_success
Example #9
def get_channel_info(channel_id):
    """Get meta information from channel website."""
    url = get_channel_video_url(channel_id)
    soup = get_soup(url)
    if soup is None:
        logger.error("Got invalid response for channel: {url}")
        sys.exit(1)

    profile = soup.find(id="channelsProfile")
    if profile is None:
        logger.info(f"Couldn't get info for channel: {url}")
        check_logged_out(soup)
        sys.exit(1)

    title = profile.find("div", {"class": "title"})
    name = title.find("h1").text.strip()

    name = name.replace(" ", "_")
    name = re.sub(r"[\W]+", "_", name)

    return {"name": name}
Example #10
def check_logged_out(soup):
    """Check if we got logged out. Return True if the premium login wall is shown."""
    # Guard against callers that pass a soup of None (see Example #11)
    if soup is None:
        return False
    enterPremium = soup.find("div", {"class": "enterPremium"})
    if enterPremium:
        logger.error("Looks like we got logged out.")
        return True
    return False
Example #11
def get_channel_viewkeys(channel):
    """Scrape all public viewkeys of the channel's videos."""
    is_premium = os.path.exists("http_cookie_file")
    if is_premium:
        url = f"https://www.pornhubpremium.com/channels/{channel.id}/videos"
    else:
        url = f"https://www.pornhub.com/channels/{channel.id}/videos"

    soup = get_soup(url)
    if soup is None:
        logger.error(f"Failed to find video page for channel {channel.id}")
        check_logged_out(soup)
        sys.exit(1)

    pages = 1
    hasNavigation = False
    hasEndlessScrolling = False

    # Some sites have a navigation at the bottom
    navigation = soup.find("div", {"class": "pagination3"})
    if navigation is not None:
        children = navigation.findChildren("li", {"class": "page_number"})
        pages = len(children) + 1
        hasNavigation = True
    # Others have a button for "endless scrolling".
    # In that case we have to keep crawling for as long as that button shows up.
    elif soup.find(id="moreDataBtnStream"):
        hasEndlessScrolling = True

    keys = []
    current_page = 1
    next_url = url
    while current_page <= pages:
        # Check if the next page has another "endless scrolling" button as well
        # If that's the case, increase the counter
        if hasEndlessScrolling and soup.find(id="moreDataBtnStream"):
            pages += 1

        logger.info(f"Crawling {next_url}")
        # Channel with normal video upload list
        videos = soup.find(id="showAllChanelVideos")

        if videos is None:
            logger.error(f"Couldn't find channel videos in site: {url}")
            check_logged_out(soup)
            sys.exit(1)

        for video in videos.find_all("li"):
            if video.has_attr("_vkey"):
                keys.append(video["_vkey"])

        current_page += 1
        next_url = url + f"?page={current_page}"

        time.sleep(4)

        soup = get_soup(next_url)
        # We couldn't get the next url.
        if soup is None:
            break

    return keys
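The paging logic above handles two layouts: with bottom navigation the page count is known up front, while with "endless scrolling" one more page is added every time the load-more button reappears. A distilled sketch of that moving-target loop (fetch_page and has_more_button are hypothetical stand-ins for the scraping calls):

def crawl_all_pages(fetch_page, has_more_button):
    """Collect pages, extending the page count while a load-more button keeps appearing."""
    pages = 1
    current_page = 1
    results = []
    while current_page <= pages:
        page = fetch_page(current_page)
        if page is None:  # the next page couldn't be fetched; stop early
            break
        if has_more_button(page):  # endless scrolling: extend the horizon by one page
            pages += 1
        results.append(page)
        current_page += 1
    return results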
Example #12
def get_video_upload_viewkeys(user, public=False):
    """Scrape viewkeys from the user's user/videos/upload route."""
    is_premium = os.path.exists("premium")
    if is_premium:
        url = (
            f"https://www.pornhubpremium.com/{user.user_type}/{user.key}/videos/premium"
        )
    else:
        url = f"https://www.pornhub.com/{user.user_type}/{user.key}/videos/upload"

    if public:
        if is_premium:
            url = f"https://www.pornhubpremium.com/{user.user_type}/{user.key}/videos/upload"
        else:
            url = f"https://www.pornhub.com/{user.user_type}/{user.key}/videos/public"

    soup = get_soup(url)
    if soup is None:
        logger.info(f"Nothing on {url}")
        return []

    pages = 1
    hasNavigation = False
    hasEndlessScrolling = False

    # Some sites have a navigation at the bottom
    navigation = soup.find("div", {"class": "pagination3"})
    if navigation is not None:
        children = navigation.findChildren("li", {"class": "page_number"})
        pages = len(children) + 1
        hasNavigation = True
    # Others have a button for "endless scrolling".
    # In that case we have to keep crawling for as long as that button shows up.
    elif soup.find(id="moreDataBtnStream"):
        hasEndlessScrolling = True

    keys = []
    current_page = 1
    next_url = url
    while current_page <= pages:
        # Check if the next page has another "endless scrolling" button as well
        # If that's the case, increase the counter
        if hasEndlessScrolling and soup.find(id="moreDataBtnStream"):
            pages += 1

        logger.info(f"Crawling {next_url}")
        videoSection = soup.find("div", {"class": "videoUList"})
        pornstarVideoSection = soup.find(id="pornstarsVideoSection")
        claimedUploadedVideoSection = soup.find(
            id="claimedUploadedVideoSection")

        # Users with normal video upload list
        if videoSection is not None:
            videos = videoSection.find(id="moreData")
        # Users with pornstarVideoSection
        elif pornstarVideoSection is not None:
            videos = pornstarVideoSection
        # Unclear what this section is for
        elif claimedUploadedVideoSection is not None:
            videos = claimedUploadedVideoSection
        else:
            logger.error(
                f"Couldn't find video section on {next_url}. Did we log out?")
            if check_logged_out(soup):
                sys.exit(1)
            return []

        for video in videos.find_all("li"):
            if video.has_attr("data-video-vkey"):
                keys.append(video["data-video-vkey"])

        current_page += 1
        next_url = url + f"?page={current_page}"

        time.sleep(4)

        soup = get_soup(next_url)
        # We couldn't get the next url.
        if soup is None:
            break

    return keys
Example #13
def download_video(viewkey, name="single_videos"):
    """Download the video."""
    # Decide which domain to use, depending on whether the user has a premium account
    is_premium = os.path.exists("premium")
    if is_premium:
        video_url = f"https://www.pornhubpremium.com/view_video.php?viewkey={viewkey}"
    else:
        video_url = f"https://www.pornhub.com/view_video.php?viewkey={viewkey}"

    options = {
        # YoutubeDL takes underscore-style option keys; the CLI's hyphenated
        # flags (e.g. --merge-output-format) would be silently ignored here.
        "outtmpl": f"/data/Media/P**n/{name}/%(title)s.%(ext)s",
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "merge_output_format": "mp4",
        "quiet": True,
        "retries": 3,
        "nooverwrites": False,
        "continuedl": True,
        "noplaylist": False,  # the API equivalent of --yes-playlist
        # --add-metadata is CLI-only; via the API it maps to this postprocessor
        "postprocessors": [{"key": "FFmpegMetadata"}],
        "external_downloader": "aria2c",
        # The API expects the external downloader arguments as a list
        "external_downloader_args": [
            "--no-conf", "--file-allocation=none", "-x16", "-s16",
            "-j5", "-k5M", "-c", "-R",
        ],
    }
    if is_premium:
        options["cookiefile"] = "cookie_file"

    ydl = youtube_dl.YoutubeDL(options)
    tries = 0
    while True:
        try:
            logger.info(f"Start downloading: {video_url}")
            info = ydl.extract_info(video_url)
            info["out_path"] = (
                f'/data/Media/P**n/{name}/{info["title"]}.{info["ext"]}'
            )
            return True, info
        except TypeError:
            # This is an error that seems to occur from time to time
            # A short wait and retry often seems to fix the problem
            # This is something about pornhub not properly loading the video.
            logger.info("Got TypeError bug")
            time.sleep(20)
            tries += 1

            # If this happens too many times, something else must be broken.
            if tries > 10:
                return False, None
            continue
        except DownloadError:
            # We got a download error.
            # Ignore for now and continue downloading the other videos
            logger.error(
                f"DownloadError: Failed to download video: {viewkey}.")
            return False, None

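A minimal sketch of driving YoutubeDL directly with options like the ones above (the URL is a placeholder):

import youtube_dl

options = {
    "outtmpl": "%(title)s.%(ext)s",
    "merge_output_format": "mp4",  # API spelling of the CLI's --merge-output-format
    "quiet": True,
}
with youtube_dl.YoutubeDL(options) as ydl:
    # download=False only extracts metadata; drop it to actually download
    info = ydl.extract_info("https://example.com/watch?v=placeholder", download=False)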