def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given facebook url and returning either a
    FacebookPost, a FacebookUser, a FacebookHandle or None if nothing of
    information could be found.

    Args:
        url (str): Url to parse.
        allow_relative_urls (bool, optional): Whether to accept urls that
            neither start with `http://`/`https://` nor contain `facebook.`.
            Such urls are joined to `BASE_FACEBOOK_URL` and are not checked
            with `is_facebook_url`. Defaults to `False`.

    Returns:
        FacebookPost, FacebookUser, FacebookHandle or None: None is returned
            when the url is rejected by `is_facebook_url`, when its path is
            empty or `/`, or when a recognized path lacks the segments or
            query parameters expected to hold the ids.

    """

    # Allowing relative urls scraped from facebook?
    if (
        allow_relative_urls and
        not url.startswith(('http://', 'https://')) and
        'facebook.' not in url
    ):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # Bailing out rather than raising IndexError on too short paths
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]
        post_id = parts[2]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(post_id, parent_id=parent_id_or_handle)

        return FacebookPost(post_id, parent_handle=parent_id_or_handle)

    # Ye olded permalink path
    if splitted.query and '/permalink.php' in path:
        query = parse_qs(splitted.query)
        story_ids = query.get('story_fbid')
        parent_ids = query.get('id')

        # Both parameters are required to build the post
        if not story_ids or not parent_ids:
            return None

        return FacebookPost(story_ids[0], parent_id=parent_ids[0])

    # Group permalink path
    if '/groups/' in path and '/permalink/' in path:
        parts = urlpathsplit(path)

        # Group id is read at index 1 and post id at index 3
        if len(parts) < 4:
            return None

        return FacebookPost(parts[3], group_id=parts[1])

    # Profile path
    if path == '/profile.php':
        user_ids = parse_qs(splitted.query).get('id')

        if not user_ids:
            return None

        return FacebookUser(user_ids[0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # User id is read at index 2
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
def extract_screen_name_from_twitter_url(url):
    """
    Function extracting a normalized screen_name from the given Twitter url.

    The screen_name is taken from the first path segment when the url has
    one. Otherwise, if the url fragment starts with `!`, it is taken from the
    fragment once `TWITTER_FRAGMENT_ROUTING_RE` matches have been removed.

    Args:
        url (str) : Url from which we extract the screen_name if found.

    Returns:
        str : the normalized screen_name, or None if the url is not a
            twitter url or if no screen_name could be found.

    """

    # Only twitter urls are relevant
    if not is_twitter_url(url):
        return None

    split_url = safe_urlsplit(url)
    segments = urlpathsplit(split_url.path)

    # Regular case: the screen_name is the first path segment
    if segments:
        return normalize_screen_name(segments[0])

    # Fragment routing case
    fragment = split_url.fragment

    if not fragment.startswith('!'):
        return None

    routed_path = re.sub(TWITTER_FRAGMENT_ROUTING_RE, '', fragment)

    return normalize_screen_name(routed_path)
def parse_google_drive_url(url):
    """
    Function parsing the given url and returning either a GoogleDriveFile,
    a GoogleDrivePublicLink or None if the url is not a relevant
    `docs.google.com` url.

    Args:
        url (str): Url to parse.

    Returns:
        GoogleDriveFile, GoogleDrivePublicLink or None: a
            GoogleDrivePublicLink when the path ends with `pub` and looks
            like `<drive_type>/d/e/<id>/.../pub`, a GoogleDriveFile when the
            path looks like `<drive_type>/d/<id>`, None otherwise.

    """
    splitted = safe_urlsplit(url)

    if 'docs.google.com' not in splitted.netloc:
        return None

    path = urlpathsplit(splitted.path)

    # Expecting at least `<drive_type>/d/<id>`
    if len(path) < 3:
        return None

    drive_type = path[0]

    if drive_type not in DRIVE_TYPES:
        return None

    if path[1] != 'd':
        return None

    # Published link
    if path[-1] == 'pub':
        if path[2] != 'e':
            return None

        # With only 4 segments, index 3 is the trailing `pub` marker itself
        # and not an id, so there is nothing to extract
        if len(path) < 5:
            return None

        return GoogleDrivePublicLink(drive_type, path[3])

    return GoogleDriveFile(drive_type, path[2])
def extract_id_from_google_drive_url(url):
    """
    Function returning the id found in a `docs.google.com` url whose path
    looks like `<drive_type>/d/<id>`.

    Args:
        url (str): Url from which to extract the id.

    Returns:
        str: the third path segment, or None if the url's netloc does not
            contain `docs.google.com`, if the path has fewer than three
            segments, if the first one is not in `DRIVE_TYPES` or if the
            second one is not `d`.

    """
    parsed = safe_urlsplit(url)

    if 'docs.google.com' not in parsed.netloc:
        return None

    segments = urlpathsplit(parsed.path)

    # Checks short-circuit in order, so no segment is read before the
    # length has been validated
    looks_like_drive_path = (
        len(segments) >= 3 and
        segments[0] in DRIVE_TYPES and
        segments[1] == 'd'
    )

    return segments[2] if looks_like_drive_path else None
def parse_youtube_url(url, fix_common_mistakes=True):
    """
    Function parsing the given url and returning either a YoutubeUser,
    YoutubeChannel, YoutubeVideo or None if nothing of information could be
    found.

    Args:
        url (str): Url to parse.
        fix_common_mistakes (bool, optional): Whether to fix common mistakes
            in Youtube urls as you can find them on the web. In practice,
            video ids read from the path or the query are truncated to their
            first 11 characters before being validated.
            Defaults to `True`.

    Returns:
        YoutubeVideo, YoutubeUser, YoutubeChannel or None: None is returned
            when the url is rejected by `is_youtube_url`, when a candidate
            video id is rejected by `is_youtube_video_id`, or when the path
            lacks the segment expected to hold an id or a name.

    """

    # Inferring redirection
    url = infer_redirection(url)

    # Continuation urls
    # NOTE(review): both patterns are searched before the `SplitResult`
    # check below, so it is unclear whether an already split url can actually
    # get that far. Confirm against `infer_redirection` and callers.
    m = NEXT_V_RE.search(url) or NESTED_NEXT_V_RE.search(url)

    if m:
        return YoutubeVideo(id=m.group(1))

    # Parsing
    if isinstance(url, SplitResult):
        parsed = url
    else:
        url = ensure_protocol(url)
        parsed = urlsplit(url)

    if not is_youtube_url(parsed):
        return None

    _, _, path, query, fragment = parsed

    # youtu.be
    if parsed.hostname.endswith('youtu.be'):
        if path.count('/') == 0:
            return None

        parts = urlpathsplit(path)

        # Bailing out rather than raising IndexError on empty paths
        if not parts:
            return None

        v = parts[0]

        if fix_common_mistakes:
            v = v[:11]

        if not is_youtube_video_id(v):
            return None

        return YoutubeVideo(id=v)

    # Hidden video in fragment
    if fragment:
        mv = FRAGMENT_V_RE.match(fragment)

        if mv:
            v = mv.group(1)

            # NOTE: ids found in the fragment are not truncated
            if not is_youtube_video_id(v):
                return None

            return YoutubeVideo(id=v)

    # Typical video url
    if path == '/watch':
        mv = QUERY_V_RE.search(query)

        # A `/watch` url without a video id must not fall through to the
        # channel fallback at the end of the function
        if not mv:
            return None

        v = mv.group(1)

        if fix_common_mistakes:
            v = v[:11]

        if not is_youtube_video_id(v):
            return None

        return YoutubeVideo(id=v)

    # Video file
    if path.startswith(('/v/', '/video/', '/embed/')):
        parts = urlpathsplit(path)

        if not parts:
            return None

        v = parts[-1]

        if fix_common_mistakes:
            v = v[:11]

        if not is_youtube_video_id(v):
            return None

        return YoutubeVideo(id=v)

    # Typical user url
    if path.startswith('/user/'):
        parts = urlpathsplit(path)

        # The name is read at index 1
        if len(parts) < 2:
            return None

        return YoutubeUser(id=None, name=parts[1])

    # Channel path?
    if path.startswith('/c/'):
        parts = urlpathsplit(path)

        if len(parts) < 2:
            return None

        return YoutubeChannel(id=None, name=parts[1])

    if path.startswith('/channel/'):
        parts = urlpathsplit(path)

        if len(parts) < 2:
            return None

        return YoutubeChannel(id=None, name=parts[1] if False else None) if False else YoutubeChannel(id=parts[1], name=None)

    # Fallback: a path made of a single segment is considered a channel name
    path = path.rstrip('/')

    if path.count('/') == 1:
        return YoutubeChannel(id=None, name=path.lstrip('/'))

    return None
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given facebook url and returning either a
    FacebookVideo, a FacebookPhoto, a FacebookPost, a FacebookGroup, a
    FacebookUser, a FacebookHandle or None if nothing of information could
    be found.

    Args:
        url (str): Url to parse.
        allow_relative_urls (bool, optional): Whether to accept urls that
            neither start with `http://`/`https://` nor contain `facebook.`.
            Such urls are joined to `BASE_FACEBOOK_URL` and are not checked
            with `is_facebook_url`. Defaults to `False`.

    Returns:
        FacebookVideo, FacebookPhoto, FacebookPost, FacebookGroup,
        FacebookUser, FacebookHandle or None: None is returned when the url
            is rejected by `is_facebook_url`, when its path is empty or `/`,
            or when a recognized path lacks the segments or query parameters
            expected to hold the ids.

    """

    # Allowing relative urls scraped from facebook?
    if (
        allow_relative_urls and
        not url.startswith(('http://', 'https://')) and
        'facebook.' not in url
    ):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Videos
    if '/watch' in path:
        video_ids = parse_qs(splitted.query).get('v')

        if not video_ids:
            return None

        return FacebookVideo(video_ids[0])

    if '/videos/' in path:
        parts = urlpathsplit(path)

        # Parent id is read at index 0 and video id at index 2.
        # Bailing out rather than raising IndexError on too short paths.
        if len(parts) < 3:
            return None

        return FacebookVideo(parts[2], parent_id=parts[0])

    # Photos
    if splitted.query and path.endswith(('/photo.php', '/photo')):
        query = parse_qs(splitted.query)
        photo_ids = query.get('fbid')

        if not photo_ids:
            return None

        group_id = None
        album_id = None

        # The `set` parameter may carry a group (`g.` prefix) and/or an
        # album (`a.` prefix)
        sets = query.get('set')

        if sets:
            group_id = next((s for s in sets if s.startswith('g.')), None)

            if group_id:
                group_id = group_id.split('g.', 1)[1]

            album_id = next((s for s in sets if s.startswith('a.')), None)

            if album_id:
                album_id = album_id.split('a.', 1)[1]

        return FacebookPhoto(photo_ids[0], group_id=group_id, album_id=album_id)

    if '/photos/' in path:
        parts = urlpathsplit(path)

        # Album is read at index 2 and photo id at index 3
        if len(parts) < 4:
            return None

        parent_id_or_handle = parts[0]
        album_id = parts[2].replace('a.', '')
        photo_id = parts[3]

        if is_facebook_id(parent_id_or_handle):
            return FacebookPhoto(photo_id, album_id=album_id, parent_id=parent_id_or_handle)

        return FacebookPhoto(photo_id, album_id=album_id, parent_handle=parent_id_or_handle)

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # Post id is read at index 2
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]
        post_id = parts[2]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(post_id, parent_id=parent_id_or_handle)

        return FacebookPost(post_id, parent_handle=parent_id_or_handle)

    # Ye olded permalink path
    if splitted.query and ('/permalink.php' in path or '/story.php' in path):
        query = parse_qs(splitted.query)
        story_ids = query.get('story_fbid')
        parent_ids = query.get('id')

        # Both parameters are required to build the post
        if not story_ids or not parent_ids:
            return None

        return FacebookPost(story_ids[0], parent_id=parent_ids[0])

    # Group permalink path
    if '/groups/' in path:
        parts = urlpathsplit(path)

        # Group id or handle is read at index 1
        if len(parts) < 2:
            return None

        group_id_or_handle = parts[1]

        if '/permalink/' in path:

            # Post id is read at index 3
            if len(parts) < 4:
                return None

            if is_facebook_id(group_id_or_handle):
                return FacebookPost(parts[3], group_id=group_id_or_handle)

            return FacebookPost(parts[3], group_handle=group_id_or_handle)

        if is_facebook_id(group_id_or_handle):
            return FacebookGroup(id=group_id_or_handle)

        return FacebookGroup(handle=group_id_or_handle)

    # Profile path
    if path == '/profile.php':
        user_ids = parse_qs(splitted.query).get('id')

        if not user_ids:
            return None

        return FacebookUser(user_ids[0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # User id is read at index 2
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path
    parts = urlpathsplit(path)

    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
def test_urlpathsplit(self):
    """
    Checking that `urlpathsplit` returns the expected result for every
    (path, result) pair listed in `URLPATHSPLIT_TESTS`.
    """
    for raw_path, expected in URLPATHSPLIT_TESTS:
        actual = urlpathsplit(raw_path)

        assert actual == expected