def __get_item_content(article: ResultSet):
    # Prefer the dedicated text container; fall back to the first unstyled paragraph
    content = article.find('div', class_="text-container")
    if content is None:
        content = article.find('p', class_=None, recursive=True)
    if content is not None:
        return content.getText().strip()
    else:
        return None
def __get_item_author(article: ResultSet):
    # Author info may live in either the author block or the story byline
    author = article.find(class_="author-block__info", recursive=True)
    if author is None:
        author = article.find(class_="story-block__byline", recursive=True)
    if author is not None:
        return author.getText().strip()
    else:
        return None
def __get_item_content(article: ResultSet):
    # Try the standfirst variants first, then fall back to the first unstyled paragraph
    content = article.find('p', class_="story-block__standfirst", recursive=True)
    if content is None:
        content = article.find('p', class_="standfirst-content", recursive=True)
    if content is None:
        content = article.find('p', class_=None)
    if content is not None:
        return content.getText().strip()
    else:
        return None
def __init__(self, user_keys: dict, mobile=False, config=None):
    if config is None:
        config = {}

    self.near = config['near'] if 'near' in config else ''
    self.dark = config['dark'] if 'dark' in config else False
    self.nojs = config['nojs'] if 'nojs' in config else False
    self.new_tab = config['new_tab'] if 'new_tab' in config else False
    self.mobile = mobile
    self.user_keys = user_keys
    self.main_divs = ResultSet('')
    self._elements = 0
def __get_item_url(article: ResultSet):
    headline_elem = AbcNewsItemParser.__get_headline_elem(article)
    # Not all articles use a common headline element, so get the first link
    # if no dedicated headline is encountered
    if headline_elem is None:
        return article.find('a')['href']
    else:
        return headline_elem.find('a')['href']
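# A minimal illustration (not part of the original parser), using a hypothetical
# article fragment without a dedicated headline element; it shows the fallback
# to the first <a> tag that __get_item_url relies on.
from bs4 import BeautifulSoup

_sample_article = BeautifulSoup(
    '<article><a href="/news/story-1">Story one</a></article>', 'html.parser'
).article
# No headline element is present, so the first link's href is used instead.
assert _sample_article.find('a')['href'] == '/news/story-1'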
def get_price(self, listing: element.ResultSet) -> str:
    """Gets a price of an apartment from a "li" element passed in
    by finding the first "div" element.

    :param listing: "li" element for one apartment
    :type listing: element.ResultSet
    :return: price of an apartment
    :rtype: str
    """
    price = listing.find("div", class_="price").text
    return price
def get_listing_link(self, listing: element.ResultSet) -> str:
    """Gets a link of a listing from a "li" element passed in
    by finding the first "a" element.

    :param listing: "li" element for one apartment
    :type listing: element.ResultSet
    :return: a link to a listing
    :rtype: str
    """
    listing_link = listing.find("a")["href"]
    return listing_link
def get_title(self, listing: element.ResultSet) -> str:
    """Gets a title of a listing from a "li" element passed in
    by finding the first "h2" element.

    :param listing: "li" element for one apartment
    :type listing: element.ResultSet
    :return: title of an apartment's listing
    :rtype: str
    """
    title = listing.find("h2", class_="title-list").text
    return title
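# A minimal illustration (not from the original scraper), using a hypothetical
# "li" fragment shaped like the markup that get_price, get_listing_link and
# get_title expect.
from bs4 import BeautifulSoup

_sample_li = BeautifulSoup(
    '<li><a href="https://example.com/listing/1">'
    '<h2 class="title-list">Sunny two-room flat</h2>'
    '<div class="price">1200 EUR</div></a></li>',
    'html.parser'
).li
assert _sample_li.find("div", class_="price").text == "1200 EUR"
assert _sample_li.find("a")["href"] == "https://example.com/listing/1"
assert _sample_li.find("h2", class_="title-list").text == "Sunny two-room flat"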
def _get_percentage_util(columns: ResultSet, row: PageElement) -> str:
    column_number = 0
    column: PageElement
    for column in columns:
        if str(column.get("contents")) == "Util%":
            column_number: int = int(columns.index(column))
            break
    sibling = row
    if column_number > 0:
        for _ in range(0, column_number - 1):
            sibling: Any = sibling.findNext("tablecell")
    return sibling.findNext("tablecell").get("contents")
def crawl_url(url: str) -> ResultSet:
    """This method uses Selenium WebDriver to run an automated Chrome browser and crawl the page.
    This is required due to the JavaScript on the Hackathon website, which needs XHR requests
    to show all events in the year.

    Args:
        url: URL to be crawled

    Returns:
        BeautifulSoup ResultSet with the relevant page source to be processed

    Raises:
        NoSuchElementException: When the "More" button can no longer be found (only for debugging)
        StaleElementReferenceException: Raised when the "More" button is not visible in the window
    """
    # Use Selenium WebDriver to run an automated Chrome browser.
    # This is required due to the JavaScript on the Hackathon website, which needs
    # XHR requests to show all events in the year.
    driver = webdriver.Chrome()
    driver.get(url)
    # TODO: Use more efficient method for waiting.
    time.sleep(2)
    scroll_down(driver)
    try:
        more_button_xpath = "/html/body/div[6]/div[3]/div[3]/a"
        more_button = driver.find_element_by_xpath(more_button_xpath)
    except NoSuchElementException:
        more_button_xpath = "/html/body/div[6]/div[2]/div[3]/a"
        more_button = driver.find_element_by_xpath(more_button_xpath)
    while True:
        scroll_down(driver)
        # TODO: Use more efficient method for waiting
        time.sleep(0.7)
        # TODO: optimize Try-Except
        try:
            driver.find_element_by_xpath(more_button_xpath)
        except NoSuchElementException as e:
            logging.debug(e)
            break
        try:
            more_button.click()
        except StaleElementReferenceException as e:
            logging.error(e)
    # Parse the page source by creating a BS4 object
    s_page = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()
    # Find the elements on the right side of the page
    container = s_page.find_all("div", {"class": ["ht-eb-card__right"]})
    # "row ht-idt-card__right__container"]})
    return container
def get_attribute(self, listing: element.ResultSet, attribute_name: str) -> str:
    """Gets a table row value by searching for the corresponding key, which is
    passed in as attribute_name.

    :param listing: "li" element for one apartment
    :type listing: element.ResultSet
    :param attribute_name: a keyword to search for in a li element
    :type attribute_name: str
    :return: value that corresponds to a keyword
    :rtype: str
    """
    sq_meters = listing.find("span", title=attribute_name).text
    return sq_meters
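# A minimal illustration (not from the original scraper) of the title-attribute
# lookup that get_attribute performs, using a hypothetical listing fragment.
from bs4 import BeautifulSoup

_sample_li = BeautifulSoup(
    '<li><span title="Area">54 m2</span><span title="Rooms">2</span></li>',
    'html.parser'
).li
assert _sample_li.find("span", title="Area").text == "54 m2"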
def _get_product_id(self, product: ResultSet) -> Optional[str]:
    """Return a product id from product url"""
    product_url_select = product.select(CSS_SELECTORS['url'])
    if not product_url_select:
        return None
    url = product_url_select[0].get('href')
    logger.info(f'Url: {url}')
    if 'slredirect' in url:
        url = self._request_executor.get_normal_url(url)
    product_id = url.split("/")[-2]
    logger.info(f"Get product id: {product_id}")
    return product_id
def parse_keywords(container: ResultSet, year: int) -> pd.DataFrame:
    """This method parses all the keywords that showed up on the page.

    Args:
        year: year of the parsed url for the dataframe.
        container: BeautifulSoup ResultSet with the relevant page source to be processed

    Returns:
        A sorted DataFrame object with the keywords, the year and the number of occurrences.
    """
    keyword_list = []  # Type: str
    final_list = []  # Type: Any
    for tag in container:
        tag_link_list = tag.find_all("a", {"class": "ht-card-tag"})
        for tag_link in tag_link_list:
            keyword_list.append(tag_link.contents[0])
    for k, v in Counter(keyword_list).items():
        final_list.append([year, k, v])
    data_frame = pd.DataFrame(final_list, columns=["Year", "Tag", "Count"])
    return data_frame
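# A minimal usage sketch (not from the original source) tying crawl_url and
# parse_keywords together. The URL and year are placeholders, and running this
# requires a working Chrome/chromedriver setup for Selenium.
if __name__ == "__main__":
    events_container = crawl_url("https://example.com/hackathons/2021")  # placeholder URL
    keywords_df = parse_keywords(events_container, year=2021)
    print(keywords_df.sort_values("Count", ascending=False).head())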
def __get_headline_elem(article: ResultSet):
    # Article can sometimes have no headline, e.g. in Daily Cartoon
    return article.find(class_="story-block__heading")
class Filter:
    def __init__(self, user_keys: dict, mobile=False, config=None):
        if config is None:
            config = {}

        self.near = config['near'] if 'near' in config else ''
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.new_tab = config['new_tab'] if 'new_tab' in config else False
        self.alt_redirect = config['alts'] if 'alts' in config else False
        self.mobile = mobile
        self.user_keys = user_keys
        self.main_divs = ResultSet('')
        self._elements = 0

    def __getitem__(self, name):
        return getattr(self, name)

    @property
    def elements(self):
        return self._elements

    def reskin(self, page):
        # Aesthetic only re-skinning
        if self.dark:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

        return page

    def encrypt_path(self, msg, is_element=False):
        # Encrypts path to avoid plaintext results in logs
        if is_element:
            # Element paths are tracked differently in order for the element key to be regenerated
            # once all elements have been loaded
            enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode()
            self._elements += 1
            return enc_path

        return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode()

    def clean(self, soup):
        self.main_divs = soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.fix_question_section()
        self.update_styling(soup)

        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
            self.update_element_src(img, 'image/png')

        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
            self.update_element_src(audio, 'audio/mpeg')

        for link in soup.find_all('a', href=True):
            self.update_link(link)

        input_form = soup.find('form')
        if input_form is not None:
            input_form['method'] = 'POST'

        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        # Update default footer and header
        footer = soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            [_.decompose() for _ in footer.find_all('div', recursive=False)
             if len(_.find_all('a', href=True)) > 2]

        header = soup.find('header')
        if header:
            header.decompose()

        return soup

    def remove_ads(self):
        if not self.main_divs:
            return

        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            has_ad = len([_ for _ in div.find_all('span', recursive=True) if has_ad_content(_.text)])
            _ = div.decompose() if has_ad else None

    def fix_question_section(self):
        if not self.main_divs:
            return

        question_divs = [_ for _ in self.main_divs.find_all('div', recursive=False)
                         if len(_.find_all('h2')) > 0]
        for question_div in question_divs:
            questions = [_ for _ in question_div.find_all('div', recursive=True)
                         if _.text.endswith('?')]

            for question in questions:
                question['style'] = 'padding: 10px; font-style: italic;'

    def update_element_src(self, element, mime):
        element_src = element['src']
        if element_src.startswith('//'):
            element_src = 'https:' + element_src
        elif element_src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            element['src'] = '/static/img/logo.png'
            element['style'] = 'height:40px;width:162px'
            return
        elif element_src.startswith(GOOG_IMG):
            element['src'] = BLANK_B64
            return

        element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
                         '&type=' + urlparse.quote(mime)
        # TODO: Non-mobile image results link to website instead of image
        # if not self.mobile:
        #     img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))

    def update_styling(self, soup):
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \
                            'font-size:18px; '

        # Fix search bar length on mobile
        try:
            search_bar = soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            pass

    def update_link(self, link):
        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        if '/advanced_search' in href or 'tbm=shop' in href:
            # TODO: The "Shopping" tab requires further filtering (see #136)
            # Temporarily removing all links to that tab for now.
            link.decompose()
            return
        elif self.new_tab:
            link['target'] = '_blank'

        result_link = urlparse.urlparse(href)
        query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''

        if query_link.startswith('/'):
            # Internal google links (i.e. mail, maps, etc) should still be forwarded to Google
            link['href'] = 'https://google.com' + query_link
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim, so we wrap it in double quotes
            if 'li:1' in href:
                query_link = '"' + query_link + '"'

            new_search = '/search?q=' + self.encrypt_path(query_link)

            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                param_val = query_params[param][0] if param in query_params else ''
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(query_link)

            # Add no-js option
            if self.nojs:
                gen_nojs(link)
        else:
            link['href'] = href

        # Replace link location if "alts" config is enabled
        if self.alt_redirect:
            # Search and replace all link descriptions with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

            # Replace link destination
            link_desc[0].replace_with(get_site_alt(link_desc[0]))
def __get_headline_elem(article: ResultSet):
    headline_elem = article.find('h3')
    return headline_elem
class Filter:
    def __init__(self, user_keys: dict, mobile=False, config=None):
        if config is None:
            config = {}

        self.near = config['near'] if 'near' in config else ''
        self.dark = config['dark'] if 'dark' in config else False
        self.nojs = config['nojs'] if 'nojs' in config else False
        self.new_tab = config['new_tab'] if 'new_tab' in config else False
        self.mobile = mobile
        self.user_keys = user_keys
        self.main_divs = ResultSet('')
        self._elements = 0

    def __getitem__(self, name):
        return getattr(self, name)

    @property
    def elements(self):
        return self._elements

    def reskin(self, page):
        # Aesthetic only re-skinning
        page = page.replace('>G<', '>Wh<')
        pattern = re.compile('4285f4|ea4335|fbcc05|34a853|fbbc05', re.IGNORECASE)
        page = pattern.sub('685e79', page)
        if self.dark:
            page = page.replace('fff', '000').replace('202124', 'ddd').replace('1967D2', '3b85ea')

        return page

    def encrypt_path(self, msg, is_element=False):
        # Encrypts path to avoid plaintext results in logs
        if is_element:
            # Element paths are tracked differently in order for the element key to be regenerated
            # once all elements have been loaded
            enc_path = Fernet(self.user_keys['element_key']).encrypt(msg.encode()).decode()
            self._elements += 1
            return enc_path

        return Fernet(self.user_keys['text_key']).encrypt(msg.encode()).decode()

    def clean(self, soup):
        self.main_divs = soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.fix_question_section()
        self.update_styling(soup)

        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
            self.update_element_src(img, 'image/png')

        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
            self.update_element_src(audio, 'audio/mpeg')

        for link in soup.find_all('a', href=True):
            self.update_link(link)

        input_form = soup.find('form')
        if input_form is not None:
            input_form['method'] = 'POST'

        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        # Update default footer and header
        footer = soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            [
                _.decompose() for _ in footer.find_all('div', recursive=False)
                if len(_.find_all('a', href=True)) > 2
            ]

        header = soup.find('header')
        if header:
            header.decompose()

        return soup

    def remove_ads(self):
        if not self.main_divs:
            return

        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            has_ad = len([
                _ for _ in div.find_all('span', recursive=True)
                if has_ad_content(_.text)
            ])
            _ = div.decompose() if has_ad else None

    def fix_question_section(self):
        if not self.main_divs:
            return

        question_divs = [
            _ for _ in self.main_divs.find_all('div', recursive=False)
            if len(_.find_all('h2')) > 0
        ]
        for question_div in question_divs:
            questions = [
                _ for _ in question_div.find_all('div', recursive=True)
                if _.text.endswith('?')
            ]

            for question in questions:
                question['style'] = 'padding: 10px; font-style: italic;'

    def update_element_src(self, element, mime):
        element_src = element['src']
        if element_src.startswith('//'):
            element_src = 'https:' + element_src
        elif element_src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            element['src'] = '/static/img/logo.png'
            element['style'] = 'height:40px;width:162px'
            return
        elif element_src.startswith(GOOG_IMG):
            element['src'] = BLANK_B64
            return

        element['src'] = '/element?url=' + self.encrypt_path(element_src, is_element=True) + \
                         '&type=' + urlparse.quote(mime)
        # TODO: Non-mobile image results link to website instead of image
        # if not self.mobile:
        #     img.append(BeautifulSoup(FULL_RES_IMG.format(element_src), 'html.parser'))

    def update_styling(self, soup):
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = 'display:flex; justify-content:center; align-items:center; color:#685e79; ' \
                            'font-size:18px; '

        # Fix search bar length on mobile
        try:
            search_bar = soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            pass

        # Set up dark mode if active
        if self.dark:
            soup.find('html')['style'] = \
                'scrollbar-color: #333 #111;color:#fff !important;background:#000 !important'
            for input_element in soup.findAll('input'):
                input_element['style'] = 'color:#fff;background:#000;'

            for span_element in soup.findAll('span'):
                span_element['style'] = 'color: white;'

            for href_element in soup.findAll('a'):
                href_element['style'] = 'color: white' if href_element['href'].startswith('/search') else ''

    def update_link(self, link):
        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        if '/advanced_search' in href:
            link.decompose()
            return
        elif self.new_tab:
            link['target'] = '_blank'

        result_link = urlparse.urlparse(href)
        query_link = parse_qs(result_link.query)['q'][0] if '?q=' in href else ''

        if query_link.startswith('/'):
            link['href'] = 'https://google.com' + query_link
        elif '/search?q=' in href:
            new_search = '/search?q=' + self.encrypt_path(query_link)

            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                param_val = query_params[param][0] if param in query_params else ''
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(query_link)

            # Add no-js option
            if self.nojs:
                gen_nojs(link)
        else:
            link['href'] = href
def __get_headline_elem(article: ResultSet):
    headline_elem = article.find(class_='headline', recursive=True)
    return headline_elem
def __init__(self, user_key: str, config: Config, mobile=False) -> None:
    self.config = config
    self.mobile = mobile
    self.user_key = user_key
    self.main_divs = ResultSet('')
    self._elements = 0
class Filter:
    # Limit used for determining if a result is a "regular" result or a list
    # type result (such as "people also asked", "related searches", etc)
    RESULT_CHILD_LIMIT = 7

    def __init__(self, user_key: str, config: Config, mobile=False) -> None:
        self.config = config
        self.mobile = mobile
        self.user_key = user_key
        self.main_divs = ResultSet('')
        self._elements = 0

    def __getitem__(self, name):
        return getattr(self, name)

    @property
    def elements(self):
        return self._elements

    def encrypt_path(self, path, is_element=False) -> str:
        # Encrypts path to avoid plaintext results in logs
        if is_element:
            # Element paths are encrypted separately from text, to allow key
            # regeneration once all items have been served to the user
            enc_path = Fernet(self.user_key).encrypt(path.encode()).decode()
            self._elements += 1
            return enc_path

        return Fernet(self.user_key).encrypt(path.encode()).decode()

    def clean(self, soup) -> BeautifulSoup:
        self.main_divs = soup.find('div', {'id': 'main'})
        self.remove_ads()
        self.remove_block_titles()
        self.remove_block_url()
        self.collapse_sections()
        self.update_styling(soup)

        for img in [_ for _ in soup.find_all('img') if 'src' in _.attrs]:
            self.update_element_src(img, 'image/png')

        for audio in [_ for _ in soup.find_all('audio') if 'src' in _.attrs]:
            self.update_element_src(audio, 'audio/mpeg')

        for link in soup.find_all('a', href=True):
            self.update_link(link)

        input_form = soup.find('form')
        if input_form is not None:
            input_form['method'] = 'GET' if self.config.get_only else 'POST'

        # Ensure no extra scripts passed through
        for script in soup('script'):
            script.decompose()

        # Update default footer and header
        footer = soup.find('footer')
        if footer:
            # Remove divs that have multiple links beyond just page navigation
            [
                _.decompose() for _ in footer.find_all('div', recursive=False)
                if len(_.find_all('a', href=True)) > 3
            ]

        header = soup.find('header')
        if header:
            header.decompose()

        return soup

    def remove_ads(self) -> None:
        """Removes ads found in the list of search result divs

        Returns:
            None (The soup object is modified directly)
        """
        if not self.main_divs:
            return

        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            div_ads = [
                _ for _ in div.find_all('span', recursive=True)
                if has_ad_content(_.text)
            ]
            _ = div.decompose() if len(div_ads) else None

    def remove_block_titles(self) -> None:
        if not self.main_divs or not self.config.block_title:
            return
        block_title = re.compile(self.block_title)
        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            block_divs = [
                _ for _ in div.find_all('h3', recursive=True)
                if block_title.search(_.text) is not None
            ]
            _ = div.decompose() if len(block_divs) else None

    def remove_block_url(self) -> None:
        if not self.main_divs or not self.config.block_url:
            return
        block_url = re.compile(self.block_url)
        for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]:
            block_divs = [
                _ for _ in div.find_all('a', recursive=True)
                if block_url.search(_.attrs['href']) is not None
            ]
            _ = div.decompose() if len(block_divs) else None

    def collapse_sections(self) -> None:
        """Collapses long result sections ("people also asked", "related
        searches", etc) into "details" elements

        These sections are typically the only sections in the results page that
        have more than ~5 child divs within a primary result div.

        Returns:
            None (The soup object is modified directly)
        """
        minimal_mode = read_config_bool('WHOOGLE_MINIMAL')

        def pull_child_divs(result_div: BeautifulSoup):
            try:
                return result_div.findChildren(
                    'div', recursive=False)[0].findChildren('div', recursive=False)
            except IndexError:
                return []

        if not self.main_divs:
            return

        # Loop through results and check for the number of child divs in each
        for result in self.main_divs:
            result_children = pull_child_divs(result)
            if minimal_mode:
                if len(result_children) in (1, 3):
                    continue
            else:
                if len(result_children) < self.RESULT_CHILD_LIMIT:
                    continue

            # Find and decompose the first element with an inner HTML text val.
            # This typically extracts the title of the section (i.e. "Related
            # Searches", "People also ask", etc)
            label = 'Collapsed Results'
            for elem in result_children:
                if elem.text:
                    label = elem.text
                    elem.decompose()
                    break

            # Create the new details element to wrap around the result's
            # first parent
            parent = None
            idx = 0
            while not parent and idx < len(result_children):
                parent = result_children[idx].parent
                idx += 1

            details = BeautifulSoup(features='html.parser').new_tag('details')
            summary = BeautifulSoup(features='html.parser').new_tag('summary')
            summary.string = label
            details.append(summary)

            if parent and not minimal_mode:
                parent.wrap(details)
            elif parent and minimal_mode:
                # Remove parent element from document if "minimal mode" is
                # enabled
                parent.decompose()

    def update_element_src(self, element: Tag, mime: str) -> None:
        """Encrypts the original src of an element and rewrites the element src
        to use the "/element?src=" pass-through.

        Returns:
            None (The soup element is modified directly)
        """
        src = element['src']

        if src.startswith('//'):
            src = 'https:' + src

        if src.startswith(LOGO_URL):
            # Re-brand with Whoogle logo
            element.replace_with(BeautifulSoup(
                render_template('logo.html'), features='html.parser'))
            return
        elif src.startswith(GOOG_IMG) or GOOG_STATIC in src:
            element['src'] = BLANK_B64
            return

        element['src'] = f'{Endpoint.element}?url=' + self.encrypt_path(
            src, is_element=True) + '&type=' + urlparse.quote(mime)

    def update_styling(self, soup) -> None:
        # Remove unnecessary button(s)
        for button in soup.find_all('button'):
            button.decompose()

        # Remove svg logos
        for svg in soup.find_all('svg'):
            svg.decompose()

        # Update logo
        logo = soup.find('a', {'class': 'l'})
        if logo and self.mobile:
            logo['style'] = ('display:flex; justify-content:center; '
                             'align-items:center; color:#685e79; '
                             'font-size:18px; ')

        # Fix search bar length on mobile
        try:
            search_bar = soup.find('header').find('form').find('div')
            search_bar['style'] = 'width: 100%;'
        except AttributeError:
            pass

    def update_link(self, link: Tag) -> None:
        """Update internal link paths with encrypted path, otherwise remove
        unnecessary redirects and/or marketing params from the url

        Args:
            link: A bs4 Tag element to inspect and update

        Returns:
            None (the tag is updated directly)
        """
        # Replace href with only the intended destination (no "utm" type tags)
        href = link['href'].replace('https://www.google.com', '')
        if 'advanced_search' in href or 'tbm=shop' in href:
            # FIXME: The "Shopping" tab requires further filtering (see #136)
            # Temporarily removing all links to that tab for now.
            link.decompose()
            return

        result_link = urlparse.urlparse(href)
        q = extract_q(result_link.query, href)

        if q.startswith('/'):
            # Internal google links (i.e. mail, maps, etc) should still
            # be forwarded to Google
            link['href'] = 'https://google.com' + q
        elif '/search?q=' in href:
            # "li:1" implies the query should be interpreted verbatim,
            # which is accomplished by wrapping the query in double quotes
            if 'li:1' in href:
                q = '"' + q + '"'
            new_search = 'search?q=' + self.encrypt_path(q)

            query_params = parse_qs(urlparse.urlparse(href).query)
            for param in VALID_PARAMS:
                if param not in query_params:
                    continue
                param_val = query_params[param][0]
                new_search += '&' + param + '=' + param_val
            link['href'] = new_search
        elif 'url?q=' in href:
            # Strip unneeded arguments
            link['href'] = filter_link_args(q)

            # Add no-js option
            if self.config.nojs:
                append_nojs(link)

            if self.config.new_tab:
                link['target'] = '_blank'
        else:
            if href.startswith(MAPS_URL):
                # Maps links don't work if a site filter is applied
                link['href'] = MAPS_URL + "?q=" + clean_query(q)
            else:
                link['href'] = href

        # Replace link location if "alts" config is enabled
        if self.config.alts:
            # Search and replace all link descriptions
            # with alternative location
            link['href'] = get_site_alt(link['href'])
            link_desc = link.find_all(
                text=re.compile('|'.join(SITE_ALTS.keys())))
            if len(link_desc) == 0:
                return

            # Replace link description
            link_desc = link_desc[0]
            for site, alt in SITE_ALTS.items():
                if site not in link_desc:
                    continue
                new_desc = BeautifulSoup(features='html.parser').new_tag('div')
                new_desc.string = str(link_desc).replace(site, alt)
                link_desc.replace_with(new_desc)
                break

    def view_image(self, soup) -> BeautifulSoup:
        """Replaces the soup with a new one that handles mobile results and
        adds the link of the image full res to the results.

        Args:
            soup: A BeautifulSoup object containing the image mobile results.

        Returns:
            BeautifulSoup: The new BeautifulSoup object
        """
        # get some tags that are unchanged between mobile and pc versions
        search_input = soup.find_all('td', attrs={'class': "O4cRJf"})[0]
        search_options = soup.find_all('div', attrs={'class': "M7pB2"})[0]
        cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
        next_pages = soup.find_all('table', attrs={'class': "uZgmoc"})[0]
        information = soup.find_all('div', attrs={'class': "TuS8Ad"})[0]

        results = []
        # find results div
        results_div = soup.find_all('div', attrs={'class': "nQvrDb"})[0]
        # find all the results
        results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})

        for item in results_all:
            urls = item.find('a')['href'].split('&imgrefurl=')

            # Skip urls that are not two-element lists
            if len(urls) != 2:
                continue

            img_url = urlparse.unquote(
                urls[0].replace(f'/{Endpoint.imgres}?imgurl=', ''))

            try:
                # Try to strip out only the necessary part of the web page link
                web_page = urlparse.unquote(urls[1].split('&')[0])
            except IndexError:
                web_page = urlparse.unquote(urls[1])

            img_tbn = urlparse.unquote(item.find('a').find('img')['src'])

            results.append({
                'domain': urlparse.urlparse(web_page).netloc,
                'img_url': img_url,
                'web_page': web_page,
                'img_tbn': img_tbn
            })

        soup = BeautifulSoup(
            render_template('imageresults.html',
                            length=len(results),
                            results=results,
                            view_label="View Image"),
            features='html.parser')

        # replace search input object
        soup.find_all('td', attrs={'class': "O4cRJf"})[0].replaceWith(search_input)
        # replace search options object (All, Images, Videos, etc.)
        soup.find_all('div', attrs={'class': "M7pB2"})[0].replaceWith(search_options)
        # replace correction suggested by google object if exists
        if len(cor_suggested):
            soup.find_all('table', attrs={'class': "By0U9"})[0].replaceWith(cor_suggested[0])
        # replace next page object at the bottom of the page
        soup.find_all('table', attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
        # replace information about user connection at the bottom of the page
        soup.find_all('div', attrs={'class': "TuS8Ad"})[0].replaceWith(information)
        return soup
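# A minimal usage sketch (not from the original application) of the Filter class
# above. The Fernet key, the Config instance and the raw HTML are placeholders;
# in the real app they come from the user's session and the upstream search response.
from bs4 import BeautifulSoup
from cryptography.fernet import Fernet

_user_key = Fernet.generate_key()   # placeholder per-session key
_config = Config()                  # assumed Config object (nojs, new_tab, alts, get_only, ...)
_raw_html = "<html><body><div id='main'></div></body></html>"  # placeholder results page

_soup = BeautifulSoup(_raw_html, "html.parser")
_cleaned = Filter(_user_key, _config, mobile=False).clean(_soup)
print(_cleaned.prettify())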
def __get_item_url(article: ResultSet):
    return article.find('a', class_="story__link")['href']
def __get_item_author(article: ResultSet):
    byline = article.find(class_='byline')
    if byline is not None:
        return byline.find('a').getText().strip()
    else:
        return None
def __get_item_url(article: ResultSet):
    headline = TheAustralianNewsItemParser.__get_headline_elem(article)
    if headline is None:
        return article.find('a')['href']
    else:
        return headline.find('a')['href']
def __get_topic_text(article: ResultSet):
    topic = article.find(class_="story-block__kicker")
    if topic is not None:
        return topic.getText().strip()
    else:
        return None
def __get_headline_text(article: ResultSet):
    return article.find(class_="story__headline__text").getText().strip()
def get_cells(row: element.ResultSet) -> List[str]:
    """ Gets the text of all th and td elements within a single tr element """
    return [el.text for el in row.find_all(['th', 'td'])]
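# A minimal illustration (not from the original source), using a hypothetical
# table row; it shows what get_cells returns for mixed header/data cells.
from bs4 import BeautifulSoup

_sample_row = BeautifulSoup(
    '<tr><th>Name</th><td>Ada</td><td>1815</td></tr>', 'html.parser'
).tr
assert [el.text for el in _sample_row.find_all(['th', 'td'])] == ['Name', 'Ada', '1815']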
def __get_headline_text(article: ResultSet):
    headline_elem = AbcNewsItemParser.__get_headline_elem(article)
    # Not all articles use a common headline element; if none is found, use the link text
    if headline_elem is None:
        headline_elem = article.find('a')
    return headline_elem.getText().strip()
def __get_topic_text(article: ResultSet):
    topic = article.find(class_="topic__string")
    if topic is not None:
        return topic.getText().strip()
    else:
        return None
def _find_advert_anhor(self, div: element.ResultSet) -> element.Tag:
    """ Searches a div for an anchor that has an href and a class but no title attribute. """
    return div.find('a', {'href': True, 'class': True, 'title': False})
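# A minimal illustration (not from the original scraper), using a hypothetical
# advert container; it shows how the href/class/title presence filters select
# the anchor that has an href and a class but no title attribute.
from bs4 import BeautifulSoup

_sample_div = BeautifulSoup(
    '<div>'
    '<a href="/ad/1" class="ad" title="sponsored">sponsored</a>'
    '<a href="/offer/2" class="offer">plain advert link</a>'
    '</div>',
    'html.parser'
).div
_anchor = _sample_div.find('a', {'href': True, 'class': True, 'title': False})
assert _anchor['href'] == '/offer/2'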