Ejemplo n.º 1
0
    def url(self):
        """
        Build the absolute facebook url for this item.

        The first attribute that is not `None` among `parent_handle`,
        `parent_id` and `group_id` (checked in that order) selects the url
        pattern.

        Returns:
            string: The url joined onto BASE_FACEBOOK_URL, or `None` when
                none of the three attributes is set.
        """
        if self.parent_handle is not None:
            path = '/%s/posts/%s' % (self.parent_handle, self.id)
        elif self.parent_id is not None:
            path = '/permalink.php?story_fbid=%s&id=%s' % (
                self.id, self.parent_id)
        elif self.group_id is not None:
            path = '/groups/%s/permalink/%s' % (self.group_id, self.id)
        else:
            # Nothing to anchor the url on.
            return None

        return urljoin(BASE_FACEBOOK_URL, path)
Ejemplo n.º 2
0
def infer_redirection(url):
    """
    Function guessing where the given url redirects to, without performing
    any request, by looking for obvious hints in the url itself.

    Args:
        url (string): Target url.

    Returns:
        string: Inferred redirection target, or the given url unchanged if
            no redirection could be inferred.
    """

    # When REDIRECTION_DOMAINS_RE matches, the second piece of the split is
    # taken as the actual target and is itself inspected again.
    pieces = REDIRECTION_DOMAINS_RE.split(url, 1)

    if len(pieces) > 1:
        return infer_redirection('https://' + pieces[1])

    match = re.search(OBVIOUS_REDIRECTS_RE, url)

    if match is None:
        return url

    candidate = unquote(match.group(1))

    # Absolute target: use it as is.
    if candidate.startswith(('http://', 'https://')):
        return candidate

    # Root-relative target: resolve it against the url carrying it.
    if candidate.startswith('/'):
        return urljoin(url, candidate)

    return url
Ejemplo n.º 3
0
    def url(self):
        """
        Build the absolute facebook url for this item.

        A truthy `group_id` takes precedence, then `parent_id`, then
        `parent_handle`; when none is set, the bare `photo.php` url is used.

        Returns:
            string: The url joined onto BASE_FACEBOOK_URL.
        """
        if self.group_id:
            path = '/photo.php?fbid=%s&set=g.%s' % (self.id, self.group_id)
        else:
            # The id and handle variants share the exact same url pattern.
            owner = self.parent_id or self.parent_handle

            if owner:
                path = '/%s/a.%s/%s' % (owner, self.album_id, self.id)
            else:
                path = '/photo.php?fbid=%s' % self.id

        return urljoin(BASE_FACEBOOK_URL, path)
Ejemplo n.º 4
0
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given facebook url and returning an object
    describing what it points to.

    Urls matching a known pattern but lacking the expected query parameter
    or path segment yield `None` instead of raising `KeyError` or
    `IndexError`.

    Args:
        url (string): Url to parse.
        allow_relative_urls (bool): Whether to resolve urls that neither
            start with a http(s) scheme nor contain 'facebook.' against
            BASE_FACEBOOK_URL. Defaults to `False`.

    Returns:
        FacebookPost, FacebookUser, FacebookHandle or None: `None` if the
            url is not a facebook url or could not be parsed.
    """

    # Allowing relative urls scraped from facebook?
    if (allow_relative_urls
            and not url.startswith(('http://', 'https://'))
            and 'facebook.' not in url):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # The post id is read from the third path segment
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and '/permalink.php' in path:
        query = parse_qs(splitted.query)

        # Both parameters are required to identify the post
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group permalink path
    if '/groups/' in path and '/permalink/' in path:
        parts = urlpathsplit(path)

        if len(parts) < 4:
            return None

        return FacebookPost(parts[3], group_id=parts[1])

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # The user id is read from the third path segment
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path: anything left whose first segment is not a php script
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
Ejemplo n.º 5
0
def infer_redirection(url, recursive=True):
    """
    Function guessing where the given url redirects to, without performing
    any request, by looking for obvious hints in the url itself.

    Args:
        url (string): Target url.
        recursive (bool): Whether to keep inferring from the found target
            until nothing more can be inferred. Defaults to `True`.

    Returns:
        string: Inferred redirection target, or the given url unchanged if
            no redirection could be inferred.
    """

    pieces = REDIRECTION_DOMAINS_RE.split(url, 1)

    if len(pieces) > 1:
        # REDIRECTION_DOMAINS_RE matched: the second piece of the split is
        # taken as the actual target.
        target = 'https://' + pieces[1]
    else:
        match = re.search(OBVIOUS_REDIRECTS_RE, url)

        if match is None:
            return url

        candidate = unquote(match.group(1))

        if candidate.startswith(('http://', 'https://')):
            # Absolute target: use it as is.
            target = candidate
        elif candidate.startswith('/'):
            # Root-relative target: resolve it against the carrying url.
            target = urljoin(url, candidate)
        else:
            return url

    return infer_redirection(target, recursive=True) if recursive else target
Ejemplo n.º 6
0
 def url(self):
     """Return `/<self.handle>` joined onto BASE_FACEBOOK_URL."""
     handle_path = '/%s' % self.handle

     return urljoin(BASE_FACEBOOK_URL, handle_path)
Ejemplo n.º 7
0
    def url(self):
        """
        Build the absolute facebook url for this item.

        Returns:
            string: The `/<handle>` url when `self.handle` is set, else the
                `profile.php?id=<id>` url, joined onto BASE_FACEBOOK_URL.
        """
        if self.handle is not None:
            path = '/%s' % self.handle
        else:
            # No handle known: fall back to the id-based profile url.
            path = '/profile.php?id=%s' % self.id

        return urljoin(BASE_FACEBOOK_URL, path)
Ejemplo n.º 8
0
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given facebook url and returning an object
    describing what it points to.

    Urls matching a known pattern but lacking the expected query parameter
    or path segment yield `None` instead of raising `KeyError` or
    `IndexError`, as the watch and photo.php patterns already did.

    Args:
        url (string): Url to parse.
        allow_relative_urls (bool): Whether to resolve urls that neither
            start with a http(s) scheme nor contain 'facebook.' against
            BASE_FACEBOOK_URL. Defaults to `False`.

    Returns:
        FacebookVideo, FacebookPhoto, FacebookPost, FacebookGroup,
        FacebookUser, FacebookHandle or None: `None` if the url is not a
            facebook url or could not be parsed.
    """

    # Allowing relative urls scraped from facebook?
    if (allow_relative_urls
            and not url.startswith(('http://', 'https://'))
            and 'facebook.' not in url):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Videos
    if '/watch' in path:
        query = parse_qs(splitted.query)

        if 'v' not in query:
            return None

        return FacebookVideo(query['v'][0])

    if '/videos/' in path:
        parts = urlpathsplit(path)

        # The video id is read from the third path segment
        if len(parts) < 3:
            return None

        return FacebookVideo(parts[2], parent_id=parts[0])

    # Photos
    if splitted.query and path.endswith(('/photo.php', '/photo')):
        query = parse_qs(splitted.query)

        if 'fbid' not in query:
            return None

        group_id = None
        album_id = None

        if 'set' in query:
            sets = query['set']

            # A `set` value is either `g.<group id>` or `a.<album id>`
            group_id = next((s for s in sets if s.startswith('g.')), None)

            if group_id:
                group_id = group_id.split('g.', 1)[1]

            album_id = next((s for s in sets if s.startswith('a.')), None)

            if album_id:
                album_id = album_id.split('a.', 1)[1]

        return FacebookPhoto(query['fbid'][0],
                             group_id=group_id,
                             album_id=album_id)

    if '/photos/' in path:
        parts = urlpathsplit(path)

        # Album and photo ids are read from the third and fourth segments
        if len(parts) < 4:
            return None

        parent_id_or_handle = parts[0]
        album_id = parts[2].replace('a.', '')
        photo_id = parts[3]

        if is_facebook_id(parent_id_or_handle):
            return FacebookPhoto(photo_id,
                                 album_id=album_id,
                                 parent_id=parent_id_or_handle)

        return FacebookPhoto(photo_id,
                             album_id=album_id,
                             parent_handle=parent_id_or_handle)

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # The post id is read from the third path segment
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and ('/permalink.php' in path
                           or '/story.php' in path):
        query = parse_qs(splitted.query)

        # Both parameters are required to identify the post
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group paths
    if '/groups/' in path:
        parts = urlpathsplit(path)

        # The group id or handle is read from the second path segment
        if len(parts) < 2:
            return None

        if '/permalink/' in path:
            # The post id is read from the fourth path segment
            if len(parts) < 4:
                return None

            if is_facebook_id(parts[1]):
                return FacebookPost(parts[3], group_id=parts[1])

            return FacebookPost(parts[3], group_handle=parts[1])

        if is_facebook_id(parts[1]):
            return FacebookGroup(id=parts[1])

        return FacebookGroup(handle=parts[1])

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # The user id is read from the third path segment
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path: anything left whose first segment is not a php script
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
Ejemplo n.º 9
0
    def url(self):
        """
        Build the absolute facebook url for this item.

        Returns:
            string: The `/<parent_id>/videos/<id>` url when `self.parent_id`
                is set, else the `/watch/?v=<id>` url, joined onto
                BASE_FACEBOOK_URL.
        """
        if self.parent_id is not None:
            path = '/%s/videos/%s' % (self.parent_id, self.id)
        else:
            path = '/watch/?v=%s' % self.id

        return urljoin(BASE_FACEBOOK_URL, path)
Ejemplo n.º 10
0
    def url(self):
        """
        Build the facebook url for this item.

        Returns:
            string: `groups/<handle>` when `self.handle` is set, else
                `groups/<id>`, joined onto BASE_FACEBOOK_URL.
        """
        # The handle is preferred over the id whenever it is known.
        slug = self.handle if self.handle is not None else self.id

        return urljoin(BASE_FACEBOOK_URL, 'groups/%s' % slug)