def extract_screen_name_from_twitter_url(url):
    """
    Function extracting the screen_name targeted by a given Twitter url.

    Args:
        url (str): Url from which the screen_name should be extracted.

    Returns:
        str: the normalized screen_name if the url is a Twitter url in which
            one could be found, None otherwise.

    """
    # Anything that is not a twitter url is rejected right away
    if not is_twitter_url(url):
        return None

    split_url = safe_urlsplit(url)
    segments = urlpathsplit(split_url.path)

    # When the path has segments, the first one is taken as the screen_name
    if segments:
        return normalize_screen_name(segments[0])

    fragment = split_url.fragment

    # Without a usable path, only fragments starting with '!' are considered
    if not fragment.startswith('!'):
        return None

    # The routing prefix is stripped from the fragment before normalization
    candidate = re.sub(TWITTER_FRAGMENT_ROUTING_RE, '', fragment)

    return normalize_screen_name(candidate)
def parse_google_drive_url(url):
    """
    Function parsing a Google Drive url into a file or public link object.

    Args:
        url (str): Url to parse.

    Returns:
        GoogleDriveFile, GoogleDrivePublicLink or None: a GoogleDriveFile
            for paths shaped like `/<type>/d/<id>/...`, a
            GoogleDrivePublicLink for paths shaped like
            `/<type>/d/e/<id>/.../pub`, and None when the url is not hosted
            on docs.google.com or does not follow one of those shapes.

    """
    splitted = safe_urlsplit(url)

    if 'docs.google.com' not in splitted.netloc:
        return None

    path = urlpathsplit(splitted.path)

    if len(path) < 3:
        return None

    drive_type = path[0]

    if drive_type not in DRIVE_TYPES:
        return None

    if path[1] != 'd':
        return None

    if path[-1] == 'pub':

        # A public link needs at least <type>/d/e/<id>/pub. With only four
        # segments, path[3] would be the trailing 'pub' marker itself and
        # would be wrongly reported as the document id.
        if path[2] != 'e' or len(path) < 5:
            return None

        return GoogleDrivePublicLink(drive_type, path[3])

    return GoogleDriveFile(drive_type, path[2])
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given Facebook url to find what it targets.

    Args:
        url (str): Url to parse.
        allow_relative_urls (bool, optional): Whether urls that neither
            start with an http(s) scheme nor contain 'facebook.' should be
            resolved against BASE_FACEBOOK_URL before being parsed.
            Defaults to False.

    Returns:
        FacebookPost, FacebookUser, FacebookHandle or None: the parsed
            target, or None when the url is not a Facebook url or when a
            recognized pattern lacks the path segment or query parameter
            holding the relevant id.

    """

    # Allowing relative urls scraped from facebook?
    if (
        allow_relative_urls and
        not url.startswith(('http://', 'https://')) and
        'facebook.' not in url
    ):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # The post id is read from the third segment: bail out rather than
        # raising an IndexError on truncated paths
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and '/permalink.php' in path:
        query = parse_qs(splitted.query)

        # Both parameters are required to identify the post
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group permalink path
    if '/groups/' in path and '/permalink/' in path:
        parts = urlpathsplit(path)

        # Group id is the second segment, post id the fourth
        if len(parts) < 4:
            return None

        return FacebookPost(parts[3], group_id=parts[1])

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        # A profile url without an `id` parameter targets nothing
        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # The user id is read from the third segment
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path
    parts = urlpathsplit(path)

    # Guarding against paths yielding no segment at all
    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None
def match(self, url):
    """
    Method returning whether the hostname of the given url yields a match
    when looked up through `self.longest`.

    Args:
        url (str): Url to test.

    Returns:
        bool: False if the url has no hostname, else the truthiness of
            what `self.longest` returns for the tokenized hostname.

    """
    hostname = safe_urlsplit(url).hostname

    # No hostname means there is nothing to look up
    if not hostname:
        return False

    tokens = tokenize_hostname(hostname)

    return bool(self.longest(tokens))
def is_facebook_link(url):
    """
    Function returning whether the given url has a hostname containing
    '.facebook.' and a path that is exactly '/l.php'.

    Args:
        url (str): Url to test.

    Returns:
        bool: True if both conditions hold, False otherwise.

    """
    parsed = safe_urlsplit(url)
    hostname = parsed.hostname

    # The hostname may be missing, in which case the url cannot match
    if hostname and '.facebook.' in hostname:
        return parsed.path == '/l.php'

    return False
def is_google_link(url):
    """
    Function returning whether the given url has a hostname containing
    'google.' and a path that is exactly '/url'.

    Args:
        url (str): Url to test.

    Returns:
        bool: True if both conditions hold, False otherwise.

    """
    parsed = safe_urlsplit(url)
    hostname = parsed.hostname

    # The hostname may be missing, in which case the url cannot match
    if hostname and 'google.' in hostname:
        return parsed.path == '/url'

    return False
def extract_id_from_google_drive_url(url):
    """
    Function extracting the third path segment of a Google Drive url, i.e.
    the one following `/<type>/d/`.

    Args:
        url (str): Url to parse.

    Returns:
        str: the third path segment, or None when the url is not hosted on
            docs.google.com, has fewer than three path segments, has a
            first segment that is not in DRIVE_TYPES or a second segment
            that is not 'd'.

    """
    parsed = safe_urlsplit(url)

    if 'docs.google.com' not in parsed.netloc:
        return None

    segments = urlpathsplit(parsed.path)

    # Expected shape: <type>/d/<id>
    has_expected_shape = (
        len(segments) >= 3 and
        segments[0] in DRIVE_TYPES and
        segments[1] == 'd'
    )

    return segments[2] if has_expected_shape else None
def is_amp_url(url):
    """
    Function returning whether the given url looks like an AMP url.

    A url is deemed to be an AMP one if any of the following holds:
        - its hostname ends with '.ampproject.org'
        - its hostname starts with 'amp-' or 'amp.'
        - its path contains '/amp/'
        - its path matches AMP_SUFFIXES_RE
        - its query is not empty and matches AMP_QUERY_RE

    Args:
        url (str): Url to test.

    Returns:
        bool: True if the url looks like an AMP url, False otherwise.

    """
    splitted = safe_urlsplit(url)

    # The hostname can be None (e.g. urls having no network location), in
    # which case calling str methods on it would raise an AttributeError.
    # Falling back to an empty string skips the hostname heuristics while
    # still letting the path & query ones run.
    hostname = splitted.hostname or ''

    if hostname.endswith('.ampproject.org'):
        return True

    if hostname.startswith(('amp-', 'amp.')):
        return True

    if '/amp/' in splitted.path:
        return True

    if AMP_SUFFIXES_RE.search(splitted.path):
        return True

    if splitted.query and AMP_QUERY_RE.search(splitted.query):
        return True

    return False
def parse_facebook_url(url, allow_relative_urls=False):
    """
    Function parsing the given Facebook url to find what it targets.

    Args:
        url (str): Url to parse.
        allow_relative_urls (bool, optional): Whether urls that neither
            start with an http(s) scheme nor contain 'facebook.' should be
            resolved against BASE_FACEBOOK_URL before being parsed.
            Defaults to False.

    Returns:
        FacebookVideo, FacebookPhoto, FacebookPost, FacebookGroup,
        FacebookUser, FacebookHandle or None: the parsed target, or None
            when the url is not a Facebook url or when a recognized pattern
            lacks the path segment or query parameter holding the relevant
            id.

    """

    # Allowing relative urls scraped from facebook?
    if (
        allow_relative_urls and
        not url.startswith(('http://', 'https://')) and
        'facebook.' not in url
    ):
        url = urljoin(BASE_FACEBOOK_URL, url)
    elif not is_facebook_url(url):
        return None

    splitted = safe_urlsplit(url)
    path = splitted.path

    if not path or path == '/':
        return None

    # Videos
    if '/watch' in path:
        query = parse_qs(splitted.query)

        if 'v' not in query:
            return None

        return FacebookVideo(query['v'][0])

    if '/videos/' in path:
        parts = urlpathsplit(path)

        # Parent is the first segment, video id the third
        if len(parts) < 3:
            return None

        return FacebookVideo(parts[2], parent_id=parts[0])

    # Photos
    if splitted.query and path.endswith(('/photo.php', '/photo')):
        query = parse_qs(splitted.query)

        if 'fbid' not in query:
            return None

        group_id = None
        album_id = None

        # The `set` parameter may carry a group ('g.' prefix) and/or an
        # album ('a.' prefix) reference
        if 'set' in query:
            sets = query['set']

            group_id = next((s for s in sets if s.startswith('g.')), None)

            if group_id:
                group_id = group_id.split('g.', 1)[1]

            album_id = next((s for s in sets if s.startswith('a.')), None)

            if album_id:
                album_id = album_id.split('a.', 1)[1]

        return FacebookPhoto(
            query['fbid'][0],
            group_id=group_id,
            album_id=album_id
        )

    if '/photos/' in path:
        parts = urlpathsplit(path)

        # Parent is the first segment, album the third, photo id the fourth
        if len(parts) < 4:
            return None

        parent_id_or_handle = parts[0]
        album_id = parts[2].replace('a.', '')
        photo_id = parts[3]

        if is_facebook_id(parent_id_or_handle):
            return FacebookPhoto(
                photo_id,
                album_id=album_id,
                parent_id=parent_id_or_handle
            )

        return FacebookPhoto(
            photo_id,
            album_id=album_id,
            parent_handle=parent_id_or_handle
        )

    # Obvious post path
    if '/posts/' in path:
        parts = urlpathsplit(path)

        # The post id is read from the third segment: bail out rather than
        # raising an IndexError on truncated paths
        if len(parts) < 3:
            return None

        parent_id_or_handle = parts[0]

        if NUMERIC_ID_RE.match(parent_id_or_handle):
            return FacebookPost(parts[2], parent_id=parent_id_or_handle)

        return FacebookPost(parts[2], parent_handle=parent_id_or_handle)

    # Ye olde permalink path
    if splitted.query and ('/permalink.php' in path or '/story.php' in path):
        query = parse_qs(splitted.query)

        # Both parameters are required to identify the post
        if 'story_fbid' not in query or 'id' not in query:
            return None

        return FacebookPost(query['story_fbid'][0], parent_id=query['id'][0])

    # Group & group permalink paths
    if '/groups/' in path:
        parts = urlpathsplit(path)

        # The group id or handle is read from the second segment
        if len(parts) < 2:
            return None

        group_id_or_handle = parts[1]

        if '/permalink/' in path:

            # The post id is read from the fourth segment
            if len(parts) < 4:
                return None

            if is_facebook_id(group_id_or_handle):
                return FacebookPost(parts[3], group_id=group_id_or_handle)

            return FacebookPost(parts[3], group_handle=group_id_or_handle)

        if is_facebook_id(group_id_or_handle):
            return FacebookGroup(id=group_id_or_handle)

        return FacebookGroup(handle=group_id_or_handle)

    # Profile path
    if path == '/profile.php':
        query = parse_qs(splitted.query)

        # A profile url without an `id` parameter targets nothing
        if 'id' not in query:
            return None

        return FacebookUser(query['id'][0])

    # People path
    if path.startswith('/people'):
        parts = urlpathsplit(path)

        # The user id is read from the third segment
        if len(parts) < 3:
            return None

        return FacebookUser(parts[2])

    # Handle path
    parts = urlpathsplit(path)

    # Guarding against paths yielding no segment at all
    if parts and not parts[0].endswith('.php'):
        return FacebookHandle(parts[0])

    return None