def url(self):
    """
    Build the canonical Facebook url of this post.

    The first available piece of information wins, in this order: parent
    handle, parent id, then group id.

    Returns:
        string: The post's url, or `None` if none of the above is set.

    """
    if self.parent_handle is not None:
        path = '/%s/posts/%s' % (self.parent_handle, self.id)
    elif self.parent_id is not None:
        path = '/permalink.php?story_fbid=%s&id=%s' % (self.id, self.parent_id)
    elif self.group_id is not None:
        path = '/groups/%s/permalink/%s' % (self.group_id, self.id)
    else:
        # Not enough information to build a url
        return None

    return urljoin(BASE_FACEBOOK_URL, path)
def infer_redirection(url):
    """
    Infer, without any network call, the url that the given url will
    redirect to, by looking for obvious hints in the url itself.

    Args:
        url (string): Target url.

    Returns:
        string: Inferred redirection target, or the url reached so far if
            no hint was found.

    """
    current = url

    # Peeling off known redirection domains for as long as one matches
    while True:
        pieces = REDIRECTION_DOMAINS_RE.split(current, 1)

        if len(pieces) <= 1:
            break

        current = 'https://' + pieces[1]

    match = re.search(OBVIOUS_REDIRECTS_RE, current)

    if match is None:
        return current

    candidate = unquote(match.group(1))

    # Absolute target: returned as is
    if candidate.startswith('http://') or candidate.startswith('https://'):
        return candidate

    # Root-relative target: resolved against the redirecting url
    if candidate.startswith('/'):
        return urljoin(current, candidate)

    return current
def url(self):
    """
    Build the Facebook url of this photo.

    A group photo is addressed through `photo.php` with a `set=g.<group>`
    parameter, a photo with a known parent (id first, handle otherwise)
    through its album path, and anything else through a bare `photo.php`
    link.

    Returns:
        string: The photo's url.

    """
    if self.group_id:
        path = '/photo.php?fbid=%s&set=g.%s' % (self.id, self.group_id)
    else:
        # The parent id takes precedence over the parent handle
        owner = self.parent_id or self.parent_handle

        if owner:
            # NOTE(review): album_id is interpolated without being checked,
            # so a missing album renders as "a.None" -- confirm that it is
            # always set whenever a parent is.
            path = '/%s/a.%s/%s' % (owner, self.album_id, self.id)
        else:
            path = '/photo.php?fbid=%s' % self.id

    return urljoin(BASE_FACEBOOK_URL, path)
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Parse the given Facebook url into an object describing what it
    points to.

    Malformed urls (missing query parameters, paths that are too short for
    the pattern they look like) yield `None` rather than raising.

    Args:
        url (string): Target url.
        allow_relative_urls (bool): Whether urls lacking both a http(s)
            scheme and a "facebook." substring should be resolved against
            BASE_FACEBOOK_URL instead of being rejected.
            Defaults to `False`.

    Returns:
        FacebookPost, FacebookUser, FacebookHandle or None: The parsed
            item, or `None` if the url is not a Facebook url or nothing
            relevant could be extracted from it.

    """

    # Allowing relative urls scraped from facebook?
    if (
        allow_relative_urls and
        not url.startswith(('http://', 'https://')) and
        'facebook.' not in url
    ):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # Expecting at least <parent>/posts/<post id>
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and '/permalink.php' in path:
        query = parse_qs(splitted.query)

        # parse_qs drops absent (and, by default, blank) parameters
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group permalink path
    if '/groups/' in path and '/permalink/' in path:
        parts = urlpathsplit(path)

        # Expecting at least groups/<group id>/permalink/<post id>
        if len(parts) < 4:
            return None

        return FacebookPost(parts[3], group_id=parts[1])

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # Only a first segment that is exactly "people" is a people path:
        # handles merely starting with "people" must fall through to the
        # handle path below.
        if parts and parts[0] == 'people':

            # Expecting at least people/<name>/<user id>
            if len(parts) < 3:
                return None

            return FacebookUser(parts[2])

    # Handle path
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
def infer_redirection(url, recursive=True):
    """
    Infer, without any network call, the url that the given url will
    redirect to, by looking for obvious hints in the url itself.

    Args:
        url (string): Target url.
        recursive (bool): Whether to keep resolving the inferred target
            until no further redirection can be inferred.
            Defaults to `True`.

    Returns:
        string: Inferred redirection target, or the original url if no
            hint was found.

    """
    current = url

    while True:
        resolved = None
        pieces = REDIRECTION_DOMAINS_RE.split(current, 1)

        if len(pieces) > 1:
            # Known redirection domain: what follows it is the target
            resolved = 'https://' + pieces[1]
        else:
            match = re.search(OBVIOUS_REDIRECTS_RE, current)

            if match is not None:
                candidate = unquote(match.group(1))

                if candidate.startswith(('http://', 'https://')):
                    resolved = candidate
                elif candidate.startswith('/'):
                    # Root-relative target: resolved against the
                    # redirecting url
                    resolved = urljoin(current, candidate)

        # Nothing more to infer
        if resolved is None:
            return current

        if not recursive:
            return resolved

        current = resolved
def url(self):
    """
    Build the Facebook url pointing to this handle.

    Returns:
        string: The handle's url.

    """
    path = '/%s' % self.handle

    return urljoin(BASE_FACEBOOK_URL, path)
def url(self):
    """
    Build the Facebook url of this user.

    The handle is used when known; the `profile.php` form relying on the
    user's id is used otherwise.

    Returns:
        string: The user's url.

    """
    if self.handle is not None:
        path = '/%s' % self.handle
    else:
        path = '/profile.php?id=%s' % self.id

    return urljoin(BASE_FACEBOOK_URL, path)
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Parse the given Facebook url into an object describing what it
    points to.

    Malformed urls (missing query parameters, paths that are too short for
    the pattern they look like) yield `None` rather than raising.

    Args:
        url (string): Target url.
        allow_relative_urls (bool): Whether urls lacking both a http(s)
            scheme and a "facebook." substring should be resolved against
            BASE_FACEBOOK_URL instead of being rejected.
            Defaults to `False`.

    Returns:
        FacebookVideo, FacebookPhoto, FacebookPost, FacebookGroup,
        FacebookUser, FacebookHandle or None: The parsed item, or `None`
            if the url is not a Facebook url or nothing relevant could be
            extracted from it.

    """

    # Allowing relative urls scraped from facebook?
    if (
        allow_relative_urls and
        not url.startswith(('http://', 'https://')) and
        'facebook.' not in url
    ):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Videos
    if '/watch' in path:
        query = parse_qs(splitted.query)

        if 'v' not in query:
            return None

        return FacebookVideo(query['v'][0])

    if '/videos/' in path:
        parts = urlpathsplit(path)

        # Expecting at least <parent>/videos/<video id>
        if len(parts) < 3:
            return None

        return FacebookVideo(parts[2], parent_id=parts[0])

    # Photos
    if splitted.query and path.endswith(('/photo.php', '/photo')):
        query = parse_qs(splitted.query)

        if 'fbid' not in query:
            return None

        group_id = None
        album_id = None

        if 'set' in query:
            sets = query['set']

            # A "g." prefixed set designates a group, an "a." prefixed one
            # an album
            group_id = next((s for s in sets if s.startswith('g.')), None)

            if group_id:
                group_id = group_id.split('g.', 1)[1]

            album_id = next((s for s in sets if s.startswith('a.')), None)

            if album_id:
                album_id = album_id.split('a.', 1)[1]

        return FacebookPhoto(
            query['fbid'][0],
            group_id=group_id,
            album_id=album_id
        )

    if '/photos/' in path:
        parts = urlpathsplit(path)

        # Expecting at least <parent>/photos/a.<album id>/<photo id>
        if len(parts) < 4:
            return None

        parent_id_or_handle = parts[0]
        album_id = parts[2].replace('a.', '')
        photo_id = parts[3]

        if is_facebook_id(parent_id_or_handle):
            return FacebookPhoto(
                photo_id,
                album_id=album_id,
                parent_id=parent_id_or_handle
            )

        return FacebookPhoto(
            photo_id,
            album_id=album_id,
            parent_handle=parent_id_or_handle
        )

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # Expecting at least <parent>/posts/<post id>
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and ('/permalink.php' in path or '/story.php' in path):
        query = parse_qs(splitted.query)

        # parse_qs drops absent (and, by default, blank) parameters
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group path
    if '/groups/' in path:
        parts = urlpathsplit(path)

        # Expecting at least groups/<group id or handle>
        if len(parts) < 2:
            return None

        group_id_or_handle = parts[1]

        # Group permalink path
        if '/permalink/' in path:

            # Expecting at least groups/<group>/permalink/<post id>
            if len(parts) < 4:
                return None

            if is_facebook_id(group_id_or_handle):
                return FacebookPost(parts[3], group_id=group_id_or_handle)

            return FacebookPost(parts[3], group_handle=group_id_or_handle)

        if is_facebook_id(group_id_or_handle):
            return FacebookGroup(id=group_id_or_handle)

        return FacebookGroup(handle=group_id_or_handle)

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # Only a first segment that is exactly "people" is a people path:
        # handles merely starting with "people" must fall through to the
        # handle path below.
        if parts and parts[0] == 'people':

            # Expecting at least people/<name>/<user id>
            if len(parts) < 3:
                return None

            return FacebookUser(parts[2])

    # Handle path
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
def url(self):
    """
    Build the Facebook url of this video.

    The video is addressed under its parent when the parent id is known,
    and through the generic `/watch/` endpoint otherwise.

    Returns:
        string: The video's url.

    """
    if self.parent_id is not None:
        path = '/%s/videos/%s' % (self.parent_id, self.id)
    else:
        path = '/watch/?v=%s' % self.id

    return urljoin(BASE_FACEBOOK_URL, path)
def url(self):
    """
    Build the Facebook url of this group.

    The handle is used when known, the group's id otherwise.

    Returns:
        string: The group's url.

    """
    identifier = self.id if self.handle is None else self.handle

    return urljoin(BASE_FACEBOOK_URL, 'groups/%s' % identifier)