Ejemplo n.º 1
0
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given Facebook url and returning either a
    FacebookPost, a FacebookUser, a FacebookHandle or None if nothing of
    information could be found.

    Args:
        url (str): Url to parse.
        allow_relative_urls (bool, optional): Whether to resolve urls having
            no http(s) scheme and not containing 'facebook.' against
            BASE_FACEBOOK_URL instead of rejecting them.
            Defaults to `False`.

    Returns:
        FacebookPost, FacebookUser, FacebookHandle or None: None is returned
            when the url is not a Facebook url, or when the path segments
            or query parameters required to identify a resource are missing.

    """

    # Allowing relative urls scraped from facebook?
    if (allow_relative_urls
            and not url.startswith(('http://', 'https://'))
            and 'facebook.' not in url):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # The post id is read from the third segment: bail out on
        # truncated paths rather than raising an IndexError
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and '/permalink.php' in path:
        query = parse_qs(splitted.query)

        # Both parameters are required, but either may be absent from
        # the query string
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group permalink path
    if '/groups/' in path and '/permalink/' in path:
        parts = urlpathsplit(path)

        # Group id is the second segment, post id the fourth one
        if len(parts) < 4:
            return None

        return FacebookPost(parts[3], group_id=parts[1])

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        # A profile url without an `id` parameter identifies nobody
        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # The user id is read from the third segment
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path: anything left whose first segment is not a php script
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
Ejemplo n.º 2
0
def extract_screen_name_from_twitter_url(url):
    """
    Function extracting the screen name targeted by the given Twitter url.

    Args:
        url (str): Url to inspect.

    Returns:
        str: The result of `normalize_screen_name` applied either to the
            first path segment or, when the path has no segment, to a
            fragment starting with '!' stripped of whatever
            TWITTER_FRAGMENT_ROUTING_RE matches. None if the url is not a
            twitter url or if neither is available.

    """

    # Urls that are not twitter ones are rejected upfront
    if not is_twitter_url(url):
        return None

    splitted = safe_urlsplit(url)
    segments = urlpathsplit(splitted.path)

    # Regular case: the screen name is the first path segment
    if segments:
        return normalize_screen_name(segments[0])

    # Otherwise the screen name may be routed through the fragment
    fragment = splitted.fragment

    if not fragment.startswith('!'):
        return None

    routed = re.sub(TWITTER_FRAGMENT_ROUTING_RE, '', fragment)

    return normalize_screen_name(routed)
Ejemplo n.º 3
0
def parse_google_drive_url(url):
    """
    Function parsing the given url and returning either a GoogleDriveFile,
    a GoogleDrivePublicLink or None if the url does not match.

    Args:
        url (str): Url to parse.

    Returns:
        GoogleDriveFile, GoogleDrivePublicLink or None: None is returned
            when the netloc does not contain 'docs.google.com' or when the
            path does not start with `<drive type>/d/<something>`.

    """
    parsed = safe_urlsplit(url)

    if 'docs.google.com' not in parsed.netloc:
        return None

    segments = urlpathsplit(parsed.path)

    # Expecting at least three segments: a known drive type, then 'd'
    if (len(segments) < 3
            or segments[0] not in DRIVE_TYPES
            or segments[1] != 'd'):
        return None

    kind = segments[0]

    # Paths not ending with 'pub' carry the file id as third segment
    if segments[-1] != 'pub':
        return GoogleDriveFile(kind, segments[2])

    # Paths ending with 'pub' must have 'e' as third segment, the id
    # being then read from the fourth one
    if segments[2] != 'e':
        return None

    return GoogleDrivePublicLink(kind, segments[3])
Ejemplo n.º 4
0
def extract_id_from_google_drive_url(url):
    """
    Function returning the third path segment of a docs.google.com url
    shaped like `<drive type>/d/<id>`.

    Args:
        url (str): Url from which to extract the id.

    Returns:
        str: The third path segment, or None when the netloc does not
            contain 'docs.google.com', the path has fewer than three
            segments, the first one is not in DRIVE_TYPES or the second
            one is not 'd'.

    """
    parsed = safe_urlsplit(url)

    if 'docs.google.com' not in parsed.netloc:
        return None

    segments = urlpathsplit(parsed.path)

    # Checks are short-circuited in order: length, drive type, 'd' marker
    matches = (
        len(segments) >= 3
        and segments[0] in DRIVE_TYPES
        and segments[1] == 'd'
    )

    return segments[2] if matches else None
Ejemplo n.º 5
0
def parse_youtube_url(url, fix_common_mistakes=True):
    """
    Function inspecting the given Youtube url to find out whether it targets
    a video, a user or a channel.

    Args:
        url (str): Url to parse.
        fix_common_mistakes (bool, optional): Whether to truncate candidate
            video ids to their first 11 characters before validating them
            (ids found in the url fragment are never truncated).
            Defaults to `True`.

    Returns:
        YoutubeVideo, YoutubeUser, YoutubeChannel or None: None when the
            url is not a Youtube url or when nothing could be extracted.

    """

    def video_or_none(candidate, truncate=fix_common_mistakes):
        # Building a YoutubeVideo only if the candidate is a valid video id
        if truncate:
            candidate = candidate[:11]

        if not is_youtube_video_id(candidate):
            return None

        return YoutubeVideo(id=candidate)

    # Inferring redirection
    url = infer_redirection(url)

    # Continuation urls: the captured group is used as is, unvalidated
    continuation = NEXT_V_RE.search(url) or NESTED_NEXT_V_RE.search(url)

    if continuation:
        return YoutubeVideo(id=continuation.group(1))

    # Parsing, unless we were already given a split url
    if isinstance(url, SplitResult):
        parsed = url
    else:
        url = ensure_protocol(url)
        parsed = urlsplit(url)

    if not is_youtube_url(parsed):
        return None

    path = parsed.path
    query = parsed.query
    fragment = parsed.fragment

    # youtu.be: the video id is the first path segment
    if parsed.hostname.endswith('youtu.be'):
        if '/' not in path:
            return None

        return video_or_none(urlpathsplit(path)[0])

    # Hidden video in fragment
    if fragment:
        hidden = FRAGMENT_V_RE.match(fragment)

        if hidden:
            return video_or_none(hidden.group(1), truncate=False)

    # Typical video url
    if path == '/watch':
        found = QUERY_V_RE.search(query)

        if not found:
            return None

        return video_or_none(found.group(1))

    # Video file: the id is the last path segment
    if path.startswith(('/v/', '/video/', '/embed/')):
        return video_or_none(urlpathsplit(path)[-1])

    # Typical user url
    if path.startswith('/user/'):
        return YoutubeUser(id=None, name=urlpathsplit(path)[1])

    # Channel paths: addressed by name under /c/, by id under /channel/
    if path.startswith('/c/'):
        return YoutubeChannel(id=None, name=urlpathsplit(path)[1])

    if path.startswith('/channel/'):
        return YoutubeChannel(id=urlpathsplit(path)[1], name=None)

    # Fallback: a path made of a single segment is read as a channel name
    trimmed = path.rstrip('/')

    if trimmed.count('/') == 1:
        return YoutubeChannel(id=None, name=trimmed.lstrip('/'))

    return None
Ejemplo n.º 6
0
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given Facebook url and returning either a
    FacebookVideo, FacebookPhoto, FacebookPost, FacebookGroup, FacebookUser,
    FacebookHandle or None if nothing of information could be found.

    Args:
        url (str): Url to parse.
        allow_relative_urls (bool, optional): Whether to resolve urls having
            no http(s) scheme and not containing 'facebook.' against
            BASE_FACEBOOK_URL instead of rejecting them.
            Defaults to `False`.

    Returns:
        FacebookVideo, FacebookPhoto, FacebookPost, FacebookGroup,
        FacebookUser, FacebookHandle or None: None is returned when the url
            is not a Facebook url, or when the path segments or query
            parameters required to identify a resource are missing.

    """

    # Allowing relative urls scraped from facebook?
    if (allow_relative_urls
            and not url.startswith(('http://', 'https://'))
            and 'facebook.' not in url):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Videos
    if '/watch' in path:
        query = parse_qs(splitted.query)

        if 'v' not in query:
            return None

        return FacebookVideo(query['v'][0])

    if '/videos/' in path:
        parts = urlpathsplit(path)

        # The video id is read from the third segment: bail out on
        # truncated paths rather than raising an IndexError
        if len(parts) < 3:
            return None

        return FacebookVideo(parts[2], parent_id=parts[0])

    # Photos
    if splitted.query and path.endswith(('/photo.php', '/photo')):
        query = parse_qs(splitted.query)

        if 'fbid' not in query:
            return None

        group_id = None
        album_id = None

        # The `set` parameter may hold a group ('g.' prefix) and/or an
        # album ('a.' prefix) reference
        sets = query.get('set', [])

        group_id = next((s for s in sets if s.startswith('g.')), None)

        if group_id:
            group_id = group_id.split('g.', 1)[1]

        album_id = next((s for s in sets if s.startswith('a.')), None)

        if album_id:
            album_id = album_id.split('a.', 1)[1]

        return FacebookPhoto(query['fbid'][0],
                             group_id=group_id,
                             album_id=album_id)

    if '/photos/' in path:
        parts = urlpathsplit(path)

        # Album is read from the third segment, photo id from the fourth
        if len(parts) < 4:
            return None

        parent_id_or_handle = parts[0]
        album_id = parts[2].replace('a.', '')
        photo_id = parts[3]

        if is_facebook_id(parent_id_or_handle):
            return FacebookPhoto(photo_id,
                                 album_id=album_id,
                                 parent_id=parent_id_or_handle)

        return FacebookPhoto(photo_id,
                             album_id=album_id,
                             parent_handle=parent_id_or_handle)

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # The post id is read from the third segment
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and ('/permalink.php' in path
                           or '/story.php' in path):
        query = parse_qs(splitted.query)

        # Both parameters are required, but either may be absent from
        # the query string
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group paths
    if '/groups/' in path:
        parts = urlpathsplit(path)

        # The group id or handle is read from the second segment
        if len(parts) < 2:
            return None

        group = parts[1]

        # Group permalink path
        if '/permalink/' in path:

            # The post id is read from the fourth segment
            if len(parts) < 4:
                return None

            if is_facebook_id(group):
                return FacebookPost(parts[3], group_id=group)

            return FacebookPost(parts[3], group_handle=group)

        if is_facebook_id(group):
            return FacebookGroup(id=group)

        return FacebookGroup(handle=group)

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        # A profile url without an `id` parameter identifies nobody
        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # The user id is read from the third segment
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path: anything left whose first segment is not a php script
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
Ejemplo n.º 7
0
 def test_urlpathsplit(self):
     """Check urlpathsplit against every pair of URLPATHSPLIT_TESTS."""
     for raw_path, expected in URLPATHSPLIT_TESTS:
         actual = urlpathsplit(raw_path)
         assert actual == expected